commit a199ecaac04720927a449f3127490004487f9ba5 Author: ModelHub XC Date: Wed Apr 22 10:23:57 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Neelectric/Llama-3.1-8B-Instruct_SFT_math00.01 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..002c144 --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/OpenR1-Math-220k_extended_Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_math00.01 +tags: +- generated_from_trainer +- trl +- open-r1 +- sft +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_math00.01 + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/OpenR1-Math-220k_extended_Llama3_4096toks](https://huggingface.co/datasets/Neelectric/OpenR1-Math-220k_extended_Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_math00.01", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_math/runs/abutu5my) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 1.0.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.8.4 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..ef1b027 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 2.651789354410495e+19, + "train_loss": 0.3651657155564753, + "train_runtime": 23763.887, + "train_samples": 86158, + "train_samples_per_second": 10.877, + "train_steps_per_second": 0.68 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: +... + + +... +" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e1d9068 --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..1996dc1 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..95a0b6c --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ca69021876378f7e6f06e58d87c9aa4783b3198a60e4d3031d84e0817f94a8c +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..df3edd3 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:152a3acae4c460cbca4df25a5d84ae03692d5304194f77ae4959d1274fd70200 +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..7c222ed --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea6c8355e15eb4494a0a47c3d31fca0904d6cf42056f689dbd4dda795e14ced1 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..40a5613 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b241b41896d73b13bd23291f83d40d4f5b69bd010816791132dc9ae9965ed681 +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..e8f05fa --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..8b0c7c1 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..ef1b027 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 2.651789354410495e+19, + "train_loss": 0.3651657155564753, + "train_runtime": 23763.887, + "train_samples": 86158, + "train_samples_per_second": 10.877, + "train_steps_per_second": 0.68 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..0379ecf --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,145438 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 16155, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00018570102135561745, + "grad_norm": 5.37971305847168, + "learning_rate": 0.0, + "loss": 0.7836, + "mean_token_accuracy": 0.7804478406906128, + "num_tokens": 29330.0, + "step": 1 + }, + { + "epoch": 0.0003714020427112349, + "grad_norm": 4.669919013977051, + "learning_rate": 6.188118811881188e-10, + "loss": 0.8181, + "mean_token_accuracy": 0.7716058492660522, + "num_tokens": 70899.0, + "step": 2 + }, + { + "epoch": 0.0005571030640668524, + "grad_norm": 4.92587947845459, + "learning_rate": 1.2376237623762375e-09, + "loss": 0.7504, + "mean_token_accuracy": 0.7889599800109863, + "num_tokens": 107307.0, + "step": 3 + }, + { + "epoch": 0.0007428040854224698, + "grad_norm": 5.068234920501709, + "learning_rate": 1.8564356435643563e-09, + "loss": 0.8357, + "mean_token_accuracy": 0.7663329839706421, + "num_tokens": 144949.0, + "step": 4 + }, + { + "epoch": 0.0009285051067780873, + "grad_norm": 4.460546493530273, + "learning_rate": 2.475247524752475e-09, + "loss": 0.7289, + "mean_token_accuracy": 0.7947450876235962, + "num_tokens": 185686.0, + "step": 5 + }, + { + "epoch": 0.0011142061281337048, + "grad_norm": 4.990022659301758, + "learning_rate": 3.0940594059405942e-09, + "loss": 0.79, + "mean_token_accuracy": 0.780113935470581, + "num_tokens": 222820.0, + "step": 6 + }, + { + "epoch": 0.0012999071494893223, + "grad_norm": 5.334825038909912, + "learning_rate": 3.7128712871287126e-09, + "loss": 0.8973, + "mean_token_accuracy": 0.7559726238250732, + "num_tokens": 256361.0, + "step": 7 + }, + { + "epoch": 0.0014856081708449396, + "grad_norm": 4.919918060302734, + "learning_rate": 4.331683168316832e-09, + "loss": 0.7988, + "mean_token_accuracy": 0.7754532098770142, + "num_tokens": 292645.0, + "step": 8 + }, + { + "epoch": 0.001671309192200557, + "grad_norm": 5.296793460845947, + "learning_rate": 4.95049504950495e-09, + "loss": 0.7845, + "mean_token_accuracy": 0.7861000299453735, + "num_tokens": 323956.0, + "step": 9 + }, + { + "epoch": 0.0018570102135561746, + "grad_norm": 4.861180305480957, + "learning_rate": 5.569306930693069e-09, + "loss": 0.7787, + "mean_token_accuracy": 0.7861747741699219, + "num_tokens": 360593.0, + "step": 10 + }, + { + "epoch": 0.002042711234911792, + "grad_norm": 5.169854640960693, + "learning_rate": 6.1881188118811884e-09, + "loss": 0.8183, + "mean_token_accuracy": 0.771783173084259, + "num_tokens": 395043.0, + "step": 11 + }, + { + "epoch": 0.0022284122562674096, + "grad_norm": 5.344350814819336, + "learning_rate": 6.806930693069306e-09, + "loss": 0.8, + "mean_token_accuracy": 0.7749800682067871, + "num_tokens": 427005.0, + "step": 12 + }, + { + "epoch": 0.002414113277623027, + "grad_norm": 5.2807793617248535, + "learning_rate": 7.425742574257425e-09, + "loss": 0.8585, + "mean_token_accuracy": 0.7643312215805054, + "num_tokens": 460815.0, + "step": 13 + }, + { + "epoch": 0.0025998142989786446, + "grad_norm": 4.980730056762695, + "learning_rate": 8.044554455445545e-09, + "loss": 0.8245, + "mean_token_accuracy": 0.7699109315872192, + "num_tokens": 498313.0, + "step": 14 + }, + { + "epoch": 0.002785515320334262, + "grad_norm": 5.08199405670166, + "learning_rate": 8.663366336633664e-09, + "loss": 0.766, + "mean_token_accuracy": 0.7859115600585938, + "num_tokens": 532997.0, + "step": 15 + }, + { + "epoch": 0.002971216341689879, + "grad_norm": 5.480612754821777, + "learning_rate": 9.28217821782178e-09, + "loss": 0.8283, + "mean_token_accuracy": 0.7668792605400085, + "num_tokens": 565437.0, + "step": 16 + }, + { + "epoch": 0.003156917363045497, + "grad_norm": 5.019312381744385, + "learning_rate": 9.9009900990099e-09, + "loss": 0.8345, + "mean_token_accuracy": 0.7692123651504517, + "num_tokens": 602683.0, + "step": 17 + }, + { + "epoch": 0.003342618384401114, + "grad_norm": 5.293575763702393, + "learning_rate": 1.051980198019802e-08, + "loss": 0.8038, + "mean_token_accuracy": 0.7760050892829895, + "num_tokens": 635018.0, + "step": 18 + }, + { + "epoch": 0.003528319405756732, + "grad_norm": 4.764909744262695, + "learning_rate": 1.1138613861386138e-08, + "loss": 0.7898, + "mean_token_accuracy": 0.7817322015762329, + "num_tokens": 673225.0, + "step": 19 + }, + { + "epoch": 0.003714020427112349, + "grad_norm": 5.57600736618042, + "learning_rate": 1.1757425742574257e-08, + "loss": 0.8507, + "mean_token_accuracy": 0.7718135118484497, + "num_tokens": 703515.0, + "step": 20 + }, + { + "epoch": 0.0038997214484679664, + "grad_norm": 4.7832932472229, + "learning_rate": 1.2376237623762377e-08, + "loss": 0.7938, + "mean_token_accuracy": 0.7801569700241089, + "num_tokens": 740984.0, + "step": 21 + }, + { + "epoch": 0.004085422469823584, + "grad_norm": 4.816987991333008, + "learning_rate": 1.2995049504950493e-08, + "loss": 0.815, + "mean_token_accuracy": 0.7735476493835449, + "num_tokens": 778875.0, + "step": 22 + }, + { + "epoch": 0.004271123491179201, + "grad_norm": 4.989592552185059, + "learning_rate": 1.3613861386138613e-08, + "loss": 0.7561, + "mean_token_accuracy": 0.7898812890052795, + "num_tokens": 814323.0, + "step": 23 + }, + { + "epoch": 0.004456824512534819, + "grad_norm": 5.031515121459961, + "learning_rate": 1.4232673267326732e-08, + "loss": 0.7932, + "mean_token_accuracy": 0.7785397171974182, + "num_tokens": 847424.0, + "step": 24 + }, + { + "epoch": 0.004642525533890436, + "grad_norm": 4.990327835083008, + "learning_rate": 1.485148514851485e-08, + "loss": 0.7898, + "mean_token_accuracy": 0.7783433198928833, + "num_tokens": 883078.0, + "step": 25 + }, + { + "epoch": 0.004828226555246054, + "grad_norm": 5.0793914794921875, + "learning_rate": 1.5470297029702968e-08, + "loss": 0.7455, + "mean_token_accuracy": 0.7935794591903687, + "num_tokens": 914859.0, + "step": 26 + }, + { + "epoch": 0.005013927576601671, + "grad_norm": 5.082070350646973, + "learning_rate": 1.608910891089109e-08, + "loss": 0.8689, + "mean_token_accuracy": 0.7577963471412659, + "num_tokens": 950286.0, + "step": 27 + }, + { + "epoch": 0.005199628597957289, + "grad_norm": 5.066713333129883, + "learning_rate": 1.6707920792079207e-08, + "loss": 0.8461, + "mean_token_accuracy": 0.76200270652771, + "num_tokens": 985854.0, + "step": 28 + }, + { + "epoch": 0.005385329619312906, + "grad_norm": 5.228720188140869, + "learning_rate": 1.732673267326733e-08, + "loss": 0.8628, + "mean_token_accuracy": 0.7588450908660889, + "num_tokens": 1020807.0, + "step": 29 + }, + { + "epoch": 0.005571030640668524, + "grad_norm": 4.577606678009033, + "learning_rate": 1.7945544554455443e-08, + "loss": 0.7989, + "mean_token_accuracy": 0.7867281436920166, + "num_tokens": 1060442.0, + "step": 30 + }, + { + "epoch": 0.005756731662024141, + "grad_norm": 4.4014201164245605, + "learning_rate": 1.856435643564356e-08, + "loss": 0.7133, + "mean_token_accuracy": 0.8012826442718506, + "num_tokens": 1102414.0, + "step": 31 + }, + { + "epoch": 0.005942432683379758, + "grad_norm": 4.918545722961426, + "learning_rate": 1.9183168316831682e-08, + "loss": 0.8109, + "mean_token_accuracy": 0.7681705951690674, + "num_tokens": 1139815.0, + "step": 32 + }, + { + "epoch": 0.006128133704735376, + "grad_norm": 4.930210113525391, + "learning_rate": 1.98019801980198e-08, + "loss": 0.8151, + "mean_token_accuracy": 0.7755060791969299, + "num_tokens": 1175936.0, + "step": 33 + }, + { + "epoch": 0.006313834726090994, + "grad_norm": 5.085374355316162, + "learning_rate": 2.042079207920792e-08, + "loss": 0.7697, + "mean_token_accuracy": 0.7845894694328308, + "num_tokens": 1210847.0, + "step": 34 + }, + { + "epoch": 0.0064995357474466105, + "grad_norm": 4.699192523956299, + "learning_rate": 2.103960396039604e-08, + "loss": 0.7912, + "mean_token_accuracy": 0.7743337154388428, + "num_tokens": 1247917.0, + "step": 35 + }, + { + "epoch": 0.006685236768802228, + "grad_norm": 4.8368706703186035, + "learning_rate": 2.1658415841584157e-08, + "loss": 0.7303, + "mean_token_accuracy": 0.7971012592315674, + "num_tokens": 1282456.0, + "step": 36 + }, + { + "epoch": 0.006870937790157846, + "grad_norm": 4.427441120147705, + "learning_rate": 2.2277227722772275e-08, + "loss": 0.7807, + "mean_token_accuracy": 0.776124119758606, + "num_tokens": 1326720.0, + "step": 37 + }, + { + "epoch": 0.007056638811513464, + "grad_norm": 5.2804741859436035, + "learning_rate": 2.2896039603960393e-08, + "loss": 0.8027, + "mean_token_accuracy": 0.7773123979568481, + "num_tokens": 1359968.0, + "step": 38 + }, + { + "epoch": 0.0072423398328690805, + "grad_norm": 5.0410237312316895, + "learning_rate": 2.3514851485148515e-08, + "loss": 0.8026, + "mean_token_accuracy": 0.7739579677581787, + "num_tokens": 1395254.0, + "step": 39 + }, + { + "epoch": 0.007428040854224698, + "grad_norm": 4.742325782775879, + "learning_rate": 2.4133663366336632e-08, + "loss": 0.8607, + "mean_token_accuracy": 0.7615556716918945, + "num_tokens": 1434343.0, + "step": 40 + }, + { + "epoch": 0.007613741875580316, + "grad_norm": 4.705336570739746, + "learning_rate": 2.4752475247524754e-08, + "loss": 0.7698, + "mean_token_accuracy": 0.7848179340362549, + "num_tokens": 1473339.0, + "step": 41 + }, + { + "epoch": 0.007799442896935933, + "grad_norm": 4.931818008422852, + "learning_rate": 2.537128712871287e-08, + "loss": 0.847, + "mean_token_accuracy": 0.7637829780578613, + "num_tokens": 1510696.0, + "step": 42 + }, + { + "epoch": 0.00798514391829155, + "grad_norm": 4.654237270355225, + "learning_rate": 2.5990099009900986e-08, + "loss": 0.7971, + "mean_token_accuracy": 0.7793983221054077, + "num_tokens": 1547613.0, + "step": 43 + }, + { + "epoch": 0.008170844939647167, + "grad_norm": 4.819889545440674, + "learning_rate": 2.6608910891089107e-08, + "loss": 0.7989, + "mean_token_accuracy": 0.7743351459503174, + "num_tokens": 1583893.0, + "step": 44 + }, + { + "epoch": 0.008356545961002786, + "grad_norm": 4.782057762145996, + "learning_rate": 2.7227722772277225e-08, + "loss": 0.7828, + "mean_token_accuracy": 0.7793833613395691, + "num_tokens": 1622203.0, + "step": 45 + }, + { + "epoch": 0.008542246982358403, + "grad_norm": 4.709727764129639, + "learning_rate": 2.7846534653465347e-08, + "loss": 0.7159, + "mean_token_accuracy": 0.7993313074111938, + "num_tokens": 1659797.0, + "step": 46 + }, + { + "epoch": 0.00872794800371402, + "grad_norm": 4.861900806427002, + "learning_rate": 2.8465346534653465e-08, + "loss": 0.7901, + "mean_token_accuracy": 0.7808409333229065, + "num_tokens": 1695768.0, + "step": 47 + }, + { + "epoch": 0.008913649025069638, + "grad_norm": 4.925446510314941, + "learning_rate": 2.9084158415841586e-08, + "loss": 0.8199, + "mean_token_accuracy": 0.7824517488479614, + "num_tokens": 1730907.0, + "step": 48 + }, + { + "epoch": 0.009099350046425255, + "grad_norm": 4.782393455505371, + "learning_rate": 2.97029702970297e-08, + "loss": 0.7869, + "mean_token_accuracy": 0.7830203771591187, + "num_tokens": 1768043.0, + "step": 49 + }, + { + "epoch": 0.009285051067780872, + "grad_norm": 5.224529266357422, + "learning_rate": 3.032178217821782e-08, + "loss": 0.8568, + "mean_token_accuracy": 0.7611982226371765, + "num_tokens": 1802606.0, + "step": 50 + }, + { + "epoch": 0.00947075208913649, + "grad_norm": 4.739872455596924, + "learning_rate": 3.0940594059405936e-08, + "loss": 0.7737, + "mean_token_accuracy": 0.7769311666488647, + "num_tokens": 1840610.0, + "step": 51 + }, + { + "epoch": 0.009656453110492107, + "grad_norm": 5.031059265136719, + "learning_rate": 3.155940594059406e-08, + "loss": 0.7597, + "mean_token_accuracy": 0.7891260981559753, + "num_tokens": 1874335.0, + "step": 52 + }, + { + "epoch": 0.009842154131847726, + "grad_norm": 4.886838436126709, + "learning_rate": 3.217821782178218e-08, + "loss": 0.84, + "mean_token_accuracy": 0.7639658451080322, + "num_tokens": 1911332.0, + "step": 53 + }, + { + "epoch": 0.010027855153203343, + "grad_norm": 4.677662372589111, + "learning_rate": 3.27970297029703e-08, + "loss": 0.7485, + "mean_token_accuracy": 0.7903457283973694, + "num_tokens": 1946609.0, + "step": 54 + }, + { + "epoch": 0.01021355617455896, + "grad_norm": 4.8324689865112305, + "learning_rate": 3.3415841584158415e-08, + "loss": 0.8284, + "mean_token_accuracy": 0.7669501900672913, + "num_tokens": 1984286.0, + "step": 55 + }, + { + "epoch": 0.010399257195914578, + "grad_norm": 5.170443058013916, + "learning_rate": 3.403465346534653e-08, + "loss": 0.8285, + "mean_token_accuracy": 0.7715325951576233, + "num_tokens": 2017005.0, + "step": 56 + }, + { + "epoch": 0.010584958217270195, + "grad_norm": 4.880318641662598, + "learning_rate": 3.465346534653466e-08, + "loss": 0.8093, + "mean_token_accuracy": 0.7750586271286011, + "num_tokens": 2051676.0, + "step": 57 + }, + { + "epoch": 0.010770659238625812, + "grad_norm": 4.780539512634277, + "learning_rate": 3.527227722772277e-08, + "loss": 0.7688, + "mean_token_accuracy": 0.7833957672119141, + "num_tokens": 2087898.0, + "step": 58 + }, + { + "epoch": 0.01095636025998143, + "grad_norm": 5.8692827224731445, + "learning_rate": 3.5891089108910886e-08, + "loss": 0.7857, + "mean_token_accuracy": 0.7836030721664429, + "num_tokens": 2114379.0, + "step": 59 + }, + { + "epoch": 0.011142061281337047, + "grad_norm": 4.6847076416015625, + "learning_rate": 3.6509900990099004e-08, + "loss": 0.8058, + "mean_token_accuracy": 0.7756685018539429, + "num_tokens": 2153383.0, + "step": 60 + }, + { + "epoch": 0.011327762302692664, + "grad_norm": 4.705888271331787, + "learning_rate": 3.712871287128712e-08, + "loss": 0.8225, + "mean_token_accuracy": 0.7664388418197632, + "num_tokens": 2188906.0, + "step": 61 + }, + { + "epoch": 0.011513463324048283, + "grad_norm": 4.8699116706848145, + "learning_rate": 3.774752475247525e-08, + "loss": 0.7907, + "mean_token_accuracy": 0.7773610353469849, + "num_tokens": 2223713.0, + "step": 62 + }, + { + "epoch": 0.0116991643454039, + "grad_norm": 4.845118999481201, + "learning_rate": 3.8366336633663365e-08, + "loss": 0.7866, + "mean_token_accuracy": 0.7807093858718872, + "num_tokens": 2258517.0, + "step": 63 + }, + { + "epoch": 0.011884865366759517, + "grad_norm": 5.1363959312438965, + "learning_rate": 3.898514851485148e-08, + "loss": 0.837, + "mean_token_accuracy": 0.7687599658966064, + "num_tokens": 2291597.0, + "step": 64 + }, + { + "epoch": 0.012070566388115135, + "grad_norm": 4.487462520599365, + "learning_rate": 3.96039603960396e-08, + "loss": 0.8075, + "mean_token_accuracy": 0.7763900756835938, + "num_tokens": 2331985.0, + "step": 65 + }, + { + "epoch": 0.012256267409470752, + "grad_norm": 4.623737335205078, + "learning_rate": 4.0222772277227725e-08, + "loss": 0.8268, + "mean_token_accuracy": 0.769075334072113, + "num_tokens": 2371273.0, + "step": 66 + }, + { + "epoch": 0.012441968430826369, + "grad_norm": 4.49259090423584, + "learning_rate": 4.084158415841584e-08, + "loss": 0.9021, + "mean_token_accuracy": 0.7483289241790771, + "num_tokens": 2412548.0, + "step": 67 + }, + { + "epoch": 0.012627669452181987, + "grad_norm": 4.398040294647217, + "learning_rate": 4.146039603960396e-08, + "loss": 0.7611, + "mean_token_accuracy": 0.7860986590385437, + "num_tokens": 2450224.0, + "step": 68 + }, + { + "epoch": 0.012813370473537604, + "grad_norm": 4.725900173187256, + "learning_rate": 4.207920792079208e-08, + "loss": 0.8138, + "mean_token_accuracy": 0.7729812860488892, + "num_tokens": 2486134.0, + "step": 69 + }, + { + "epoch": 0.012999071494893221, + "grad_norm": 4.546443462371826, + "learning_rate": 4.26980198019802e-08, + "loss": 0.8032, + "mean_token_accuracy": 0.7774839997291565, + "num_tokens": 2523441.0, + "step": 70 + }, + { + "epoch": 0.01318477251624884, + "grad_norm": 4.37547492980957, + "learning_rate": 4.3316831683168315e-08, + "loss": 0.7933, + "mean_token_accuracy": 0.7785382270812988, + "num_tokens": 2564095.0, + "step": 71 + }, + { + "epoch": 0.013370473537604457, + "grad_norm": 4.665307998657227, + "learning_rate": 4.393564356435643e-08, + "loss": 0.7351, + "mean_token_accuracy": 0.7943111658096313, + "num_tokens": 2597883.0, + "step": 72 + }, + { + "epoch": 0.013556174558960075, + "grad_norm": 5.182415008544922, + "learning_rate": 4.455445544554455e-08, + "loss": 0.8415, + "mean_token_accuracy": 0.7689023017883301, + "num_tokens": 2631983.0, + "step": 73 + }, + { + "epoch": 0.013741875580315692, + "grad_norm": 4.611040115356445, + "learning_rate": 4.517326732673267e-08, + "loss": 0.7871, + "mean_token_accuracy": 0.7796375155448914, + "num_tokens": 2667982.0, + "step": 74 + }, + { + "epoch": 0.013927576601671309, + "grad_norm": 4.641328811645508, + "learning_rate": 4.5792079207920787e-08, + "loss": 0.8129, + "mean_token_accuracy": 0.7767379283905029, + "num_tokens": 2706648.0, + "step": 75 + }, + { + "epoch": 0.014113277623026927, + "grad_norm": 4.4577789306640625, + "learning_rate": 4.641089108910891e-08, + "loss": 0.7491, + "mean_token_accuracy": 0.7913197875022888, + "num_tokens": 2745517.0, + "step": 76 + }, + { + "epoch": 0.014298978644382544, + "grad_norm": 4.565674304962158, + "learning_rate": 4.702970297029703e-08, + "loss": 0.7775, + "mean_token_accuracy": 0.7837197184562683, + "num_tokens": 2783393.0, + "step": 77 + }, + { + "epoch": 0.014484679665738161, + "grad_norm": 4.989060878753662, + "learning_rate": 4.764851485148515e-08, + "loss": 0.7862, + "mean_token_accuracy": 0.7791422009468079, + "num_tokens": 2815913.0, + "step": 78 + }, + { + "epoch": 0.01467038068709378, + "grad_norm": 4.699174880981445, + "learning_rate": 4.8267326732673265e-08, + "loss": 0.7833, + "mean_token_accuracy": 0.7777369618415833, + "num_tokens": 2851273.0, + "step": 79 + }, + { + "epoch": 0.014856081708449397, + "grad_norm": 5.082072734832764, + "learning_rate": 4.888613861386138e-08, + "loss": 0.8479, + "mean_token_accuracy": 0.765475332736969, + "num_tokens": 2884857.0, + "step": 80 + }, + { + "epoch": 0.015041782729805013, + "grad_norm": 4.462312698364258, + "learning_rate": 4.950495049504951e-08, + "loss": 0.7838, + "mean_token_accuracy": 0.779511034488678, + "num_tokens": 2923582.0, + "step": 81 + }, + { + "epoch": 0.015227483751160632, + "grad_norm": 4.46368408203125, + "learning_rate": 5.0123762376237625e-08, + "loss": 0.7755, + "mean_token_accuracy": 0.7816145420074463, + "num_tokens": 2959092.0, + "step": 82 + }, + { + "epoch": 0.015413184772516249, + "grad_norm": 4.518491268157959, + "learning_rate": 5.074257425742574e-08, + "loss": 0.7453, + "mean_token_accuracy": 0.7924625277519226, + "num_tokens": 2992864.0, + "step": 83 + }, + { + "epoch": 0.015598885793871866, + "grad_norm": 4.374424934387207, + "learning_rate": 5.1361386138613855e-08, + "loss": 0.7805, + "mean_token_accuracy": 0.7788180112838745, + "num_tokens": 3030883.0, + "step": 84 + }, + { + "epoch": 0.015784586815227482, + "grad_norm": 4.397274017333984, + "learning_rate": 5.198019801980197e-08, + "loss": 0.7935, + "mean_token_accuracy": 0.7772831916809082, + "num_tokens": 3070207.0, + "step": 85 + }, + { + "epoch": 0.0159702878365831, + "grad_norm": 4.150781631469727, + "learning_rate": 5.25990099009901e-08, + "loss": 0.7993, + "mean_token_accuracy": 0.773209810256958, + "num_tokens": 3110351.0, + "step": 86 + }, + { + "epoch": 0.01615598885793872, + "grad_norm": 4.276566028594971, + "learning_rate": 5.3217821782178215e-08, + "loss": 0.7363, + "mean_token_accuracy": 0.7918062806129456, + "num_tokens": 3145989.0, + "step": 87 + }, + { + "epoch": 0.016341689879294335, + "grad_norm": 4.225717544555664, + "learning_rate": 5.383663366336633e-08, + "loss": 0.7795, + "mean_token_accuracy": 0.779829740524292, + "num_tokens": 3185075.0, + "step": 88 + }, + { + "epoch": 0.016527390900649953, + "grad_norm": 4.055614948272705, + "learning_rate": 5.445544554455445e-08, + "loss": 0.7822, + "mean_token_accuracy": 0.7774534821510315, + "num_tokens": 3223912.0, + "step": 89 + }, + { + "epoch": 0.016713091922005572, + "grad_norm": 4.180597305297852, + "learning_rate": 5.5074257425742575e-08, + "loss": 0.7871, + "mean_token_accuracy": 0.7777541279792786, + "num_tokens": 3262896.0, + "step": 90 + }, + { + "epoch": 0.016898792943361187, + "grad_norm": 3.9851624965667725, + "learning_rate": 5.569306930693069e-08, + "loss": 0.7705, + "mean_token_accuracy": 0.7797423005104065, + "num_tokens": 3303667.0, + "step": 91 + }, + { + "epoch": 0.017084493964716806, + "grad_norm": 4.362484931945801, + "learning_rate": 5.631188118811881e-08, + "loss": 0.7629, + "mean_token_accuracy": 0.7824056148529053, + "num_tokens": 3339872.0, + "step": 92 + }, + { + "epoch": 0.017270194986072424, + "grad_norm": 4.532744884490967, + "learning_rate": 5.693069306930693e-08, + "loss": 0.7923, + "mean_token_accuracy": 0.776240348815918, + "num_tokens": 3374527.0, + "step": 93 + }, + { + "epoch": 0.01745589600742804, + "grad_norm": 4.217013835906982, + "learning_rate": 5.754950495049505e-08, + "loss": 0.8, + "mean_token_accuracy": 0.7743369340896606, + "num_tokens": 3412747.0, + "step": 94 + }, + { + "epoch": 0.017641597028783658, + "grad_norm": 4.070319652557373, + "learning_rate": 5.816831683168317e-08, + "loss": 0.8165, + "mean_token_accuracy": 0.7682375311851501, + "num_tokens": 3454340.0, + "step": 95 + }, + { + "epoch": 0.017827298050139277, + "grad_norm": 4.2379021644592285, + "learning_rate": 5.878712871287128e-08, + "loss": 0.7969, + "mean_token_accuracy": 0.7732361555099487, + "num_tokens": 3492901.0, + "step": 96 + }, + { + "epoch": 0.01801299907149489, + "grad_norm": 4.534665584564209, + "learning_rate": 5.94059405940594e-08, + "loss": 0.7788, + "mean_token_accuracy": 0.7733286619186401, + "num_tokens": 3527918.0, + "step": 97 + }, + { + "epoch": 0.01819870009285051, + "grad_norm": 4.4654998779296875, + "learning_rate": 6.002475247524753e-08, + "loss": 0.818, + "mean_token_accuracy": 0.7716416716575623, + "num_tokens": 3562813.0, + "step": 98 + }, + { + "epoch": 0.01838440111420613, + "grad_norm": 4.452584743499756, + "learning_rate": 6.064356435643564e-08, + "loss": 0.7794, + "mean_token_accuracy": 0.7789919972419739, + "num_tokens": 3597325.0, + "step": 99 + }, + { + "epoch": 0.018570102135561744, + "grad_norm": 4.548536777496338, + "learning_rate": 6.126237623762376e-08, + "loss": 0.7973, + "mean_token_accuracy": 0.7743390798568726, + "num_tokens": 3630220.0, + "step": 100 + }, + { + "epoch": 0.018755803156917362, + "grad_norm": 4.698631286621094, + "learning_rate": 6.188118811881187e-08, + "loss": 0.7909, + "mean_token_accuracy": 0.7733874917030334, + "num_tokens": 3661537.0, + "step": 101 + }, + { + "epoch": 0.01894150417827298, + "grad_norm": 4.296823024749756, + "learning_rate": 6.25e-08, + "loss": 0.8711, + "mean_token_accuracy": 0.7489355802536011, + "num_tokens": 3700989.0, + "step": 102 + }, + { + "epoch": 0.0191272051996286, + "grad_norm": 4.595815181732178, + "learning_rate": 6.311881188118812e-08, + "loss": 0.7918, + "mean_token_accuracy": 0.7727444171905518, + "num_tokens": 3734247.0, + "step": 103 + }, + { + "epoch": 0.019312906220984215, + "grad_norm": 4.200996398925781, + "learning_rate": 6.373762376237623e-08, + "loss": 0.7698, + "mean_token_accuracy": 0.7785274982452393, + "num_tokens": 3772140.0, + "step": 104 + }, + { + "epoch": 0.019498607242339833, + "grad_norm": 4.403714656829834, + "learning_rate": 6.435643564356436e-08, + "loss": 0.8069, + "mean_token_accuracy": 0.7673711180686951, + "num_tokens": 3808067.0, + "step": 105 + }, + { + "epoch": 0.019684308263695452, + "grad_norm": 4.63593864440918, + "learning_rate": 6.497524752475247e-08, + "loss": 0.7402, + "mean_token_accuracy": 0.7883549928665161, + "num_tokens": 3840358.0, + "step": 106 + }, + { + "epoch": 0.019870009285051067, + "grad_norm": 4.843686103820801, + "learning_rate": 6.55940594059406e-08, + "loss": 0.7887, + "mean_token_accuracy": 0.7752013206481934, + "num_tokens": 3871412.0, + "step": 107 + }, + { + "epoch": 0.020055710306406686, + "grad_norm": 4.370342254638672, + "learning_rate": 6.621287128712872e-08, + "loss": 0.8471, + "mean_token_accuracy": 0.7604113817214966, + "num_tokens": 3909509.0, + "step": 108 + }, + { + "epoch": 0.020241411327762304, + "grad_norm": 4.503335475921631, + "learning_rate": 6.683168316831683e-08, + "loss": 0.7322, + "mean_token_accuracy": 0.7869597673416138, + "num_tokens": 3943644.0, + "step": 109 + }, + { + "epoch": 0.02042711234911792, + "grad_norm": 4.419270992279053, + "learning_rate": 6.745049504950495e-08, + "loss": 0.6959, + "mean_token_accuracy": 0.8025228381156921, + "num_tokens": 3979318.0, + "step": 110 + }, + { + "epoch": 0.020612813370473538, + "grad_norm": 4.5619587898254395, + "learning_rate": 6.806930693069307e-08, + "loss": 0.8143, + "mean_token_accuracy": 0.7668987512588501, + "num_tokens": 4015080.0, + "step": 111 + }, + { + "epoch": 0.020798514391829157, + "grad_norm": 4.92673397064209, + "learning_rate": 6.868811881188119e-08, + "loss": 0.8799, + "mean_token_accuracy": 0.7517000436782837, + "num_tokens": 4048172.0, + "step": 112 + }, + { + "epoch": 0.02098421541318477, + "grad_norm": 4.52579927444458, + "learning_rate": 6.930693069306931e-08, + "loss": 0.7388, + "mean_token_accuracy": 0.7870687246322632, + "num_tokens": 4081863.0, + "step": 113 + }, + { + "epoch": 0.02116991643454039, + "grad_norm": 4.134036540985107, + "learning_rate": 6.992574257425743e-08, + "loss": 0.7679, + "mean_token_accuracy": 0.7790867686271667, + "num_tokens": 4119683.0, + "step": 114 + }, + { + "epoch": 0.02135561745589601, + "grad_norm": 4.784775257110596, + "learning_rate": 7.054455445544554e-08, + "loss": 0.8188, + "mean_token_accuracy": 0.7653440237045288, + "num_tokens": 4150083.0, + "step": 115 + }, + { + "epoch": 0.021541318477251624, + "grad_norm": 3.859217643737793, + "learning_rate": 7.116336633663365e-08, + "loss": 0.7378, + "mean_token_accuracy": 0.7845152616500854, + "num_tokens": 4185733.0, + "step": 116 + }, + { + "epoch": 0.021727019498607242, + "grad_norm": 3.619161367416382, + "learning_rate": 7.178217821782177e-08, + "loss": 0.6997, + "mean_token_accuracy": 0.799192488193512, + "num_tokens": 4225626.0, + "step": 117 + }, + { + "epoch": 0.02191272051996286, + "grad_norm": 3.883150577545166, + "learning_rate": 7.24009900990099e-08, + "loss": 0.7229, + "mean_token_accuracy": 0.7902628183364868, + "num_tokens": 4261199.0, + "step": 118 + }, + { + "epoch": 0.022098421541318476, + "grad_norm": 4.275238990783691, + "learning_rate": 7.301980198019801e-08, + "loss": 0.8002, + "mean_token_accuracy": 0.7676640748977661, + "num_tokens": 4294545.0, + "step": 119 + }, + { + "epoch": 0.022284122562674095, + "grad_norm": 4.158441543579102, + "learning_rate": 7.363861386138613e-08, + "loss": 0.7926, + "mean_token_accuracy": 0.7728716135025024, + "num_tokens": 4329504.0, + "step": 120 + }, + { + "epoch": 0.022469823584029713, + "grad_norm": 3.897000312805176, + "learning_rate": 7.425742574257424e-08, + "loss": 0.7435, + "mean_token_accuracy": 0.7851965427398682, + "num_tokens": 4371603.0, + "step": 121 + }, + { + "epoch": 0.02265552460538533, + "grad_norm": 3.5757195949554443, + "learning_rate": 7.487623762376237e-08, + "loss": 0.7069, + "mean_token_accuracy": 0.7950848340988159, + "num_tokens": 4412953.0, + "step": 122 + }, + { + "epoch": 0.022841225626740947, + "grad_norm": 3.9509994983673096, + "learning_rate": 7.54950495049505e-08, + "loss": 0.6992, + "mean_token_accuracy": 0.7952991724014282, + "num_tokens": 4446716.0, + "step": 123 + }, + { + "epoch": 0.023026926648096566, + "grad_norm": 3.740262985229492, + "learning_rate": 7.61138613861386e-08, + "loss": 0.8013, + "mean_token_accuracy": 0.7712751030921936, + "num_tokens": 4485520.0, + "step": 124 + }, + { + "epoch": 0.02321262766945218, + "grad_norm": 4.433293342590332, + "learning_rate": 7.673267326732673e-08, + "loss": 0.808, + "mean_token_accuracy": 0.7634245753288269, + "num_tokens": 4514052.0, + "step": 125 + }, + { + "epoch": 0.0233983286908078, + "grad_norm": 3.602353572845459, + "learning_rate": 7.735148514851484e-08, + "loss": 0.701, + "mean_token_accuracy": 0.7932593822479248, + "num_tokens": 4548532.0, + "step": 126 + }, + { + "epoch": 0.023584029712163418, + "grad_norm": 3.472135305404663, + "learning_rate": 7.797029702970297e-08, + "loss": 0.7151, + "mean_token_accuracy": 0.7904304265975952, + "num_tokens": 4588426.0, + "step": 127 + }, + { + "epoch": 0.023769730733519033, + "grad_norm": 3.4632863998413086, + "learning_rate": 7.858910891089109e-08, + "loss": 0.6939, + "mean_token_accuracy": 0.796696126461029, + "num_tokens": 4624915.0, + "step": 128 + }, + { + "epoch": 0.02395543175487465, + "grad_norm": 3.370288372039795, + "learning_rate": 7.92079207920792e-08, + "loss": 0.7447, + "mean_token_accuracy": 0.7852753400802612, + "num_tokens": 4665151.0, + "step": 129 + }, + { + "epoch": 0.02414113277623027, + "grad_norm": 3.4189553260803223, + "learning_rate": 7.982673267326733e-08, + "loss": 0.696, + "mean_token_accuracy": 0.7951404452323914, + "num_tokens": 4700779.0, + "step": 130 + }, + { + "epoch": 0.024326833797585885, + "grad_norm": 3.5197837352752686, + "learning_rate": 8.044554455445545e-08, + "loss": 0.7322, + "mean_token_accuracy": 0.7850398421287537, + "num_tokens": 4737860.0, + "step": 131 + }, + { + "epoch": 0.024512534818941504, + "grad_norm": 3.312953233718872, + "learning_rate": 8.106435643564356e-08, + "loss": 0.6653, + "mean_token_accuracy": 0.801532506942749, + "num_tokens": 4775635.0, + "step": 132 + }, + { + "epoch": 0.024698235840297122, + "grad_norm": 3.277859926223755, + "learning_rate": 8.168316831683169e-08, + "loss": 0.7469, + "mean_token_accuracy": 0.7844567894935608, + "num_tokens": 4817121.0, + "step": 133 + }, + { + "epoch": 0.024883936861652738, + "grad_norm": 3.3142387866973877, + "learning_rate": 8.23019801980198e-08, + "loss": 0.7247, + "mean_token_accuracy": 0.7851461172103882, + "num_tokens": 4857077.0, + "step": 134 + }, + { + "epoch": 0.025069637883008356, + "grad_norm": 3.8127846717834473, + "learning_rate": 8.292079207920792e-08, + "loss": 0.8168, + "mean_token_accuracy": 0.766022264957428, + "num_tokens": 4889951.0, + "step": 135 + }, + { + "epoch": 0.025255338904363975, + "grad_norm": 3.5271108150482178, + "learning_rate": 8.353960396039605e-08, + "loss": 0.712, + "mean_token_accuracy": 0.7942056059837341, + "num_tokens": 4924274.0, + "step": 136 + }, + { + "epoch": 0.02544103992571959, + "grad_norm": 3.2601609230041504, + "learning_rate": 8.415841584158416e-08, + "loss": 0.7362, + "mean_token_accuracy": 0.7840765118598938, + "num_tokens": 4965061.0, + "step": 137 + }, + { + "epoch": 0.02562674094707521, + "grad_norm": 3.400728464126587, + "learning_rate": 8.477722772277228e-08, + "loss": 0.7244, + "mean_token_accuracy": 0.7864826917648315, + "num_tokens": 5000850.0, + "step": 138 + }, + { + "epoch": 0.025812441968430827, + "grad_norm": 3.453608274459839, + "learning_rate": 8.53960396039604e-08, + "loss": 0.7327, + "mean_token_accuracy": 0.788850724697113, + "num_tokens": 5034969.0, + "step": 139 + }, + { + "epoch": 0.025998142989786442, + "grad_norm": 3.0404491424560547, + "learning_rate": 8.60148514851485e-08, + "loss": 0.6885, + "mean_token_accuracy": 0.7955827116966248, + "num_tokens": 5079849.0, + "step": 140 + }, + { + "epoch": 0.02618384401114206, + "grad_norm": 3.373993396759033, + "learning_rate": 8.663366336633663e-08, + "loss": 0.7587, + "mean_token_accuracy": 0.7787449359893799, + "num_tokens": 5115922.0, + "step": 141 + }, + { + "epoch": 0.02636954503249768, + "grad_norm": 3.3926949501037598, + "learning_rate": 8.725247524752474e-08, + "loss": 0.7319, + "mean_token_accuracy": 0.7850250601768494, + "num_tokens": 5151455.0, + "step": 142 + }, + { + "epoch": 0.026555246053853298, + "grad_norm": 4.189711093902588, + "learning_rate": 8.787128712871287e-08, + "loss": 0.7653, + "mean_token_accuracy": 0.7793111801147461, + "num_tokens": 5177702.0, + "step": 143 + }, + { + "epoch": 0.026740947075208913, + "grad_norm": 3.778784990310669, + "learning_rate": 8.849009900990098e-08, + "loss": 0.6833, + "mean_token_accuracy": 0.7928882837295532, + "num_tokens": 5206539.0, + "step": 144 + }, + { + "epoch": 0.02692664809656453, + "grad_norm": 3.3220372200012207, + "learning_rate": 8.91089108910891e-08, + "loss": 0.6955, + "mean_token_accuracy": 0.7950124740600586, + "num_tokens": 5240765.0, + "step": 145 + }, + { + "epoch": 0.02711234911792015, + "grad_norm": 3.3522515296936035, + "learning_rate": 8.972772277227723e-08, + "loss": 0.7198, + "mean_token_accuracy": 0.7885099649429321, + "num_tokens": 5280730.0, + "step": 146 + }, + { + "epoch": 0.027298050139275765, + "grad_norm": 3.203969717025757, + "learning_rate": 9.034653465346534e-08, + "loss": 0.6728, + "mean_token_accuracy": 0.804628849029541, + "num_tokens": 5318903.0, + "step": 147 + }, + { + "epoch": 0.027483751160631384, + "grad_norm": 3.790148973464966, + "learning_rate": 9.096534653465346e-08, + "loss": 0.767, + "mean_token_accuracy": 0.7716293334960938, + "num_tokens": 5351433.0, + "step": 148 + }, + { + "epoch": 0.027669452181987002, + "grad_norm": 3.216379165649414, + "learning_rate": 9.158415841584157e-08, + "loss": 0.6786, + "mean_token_accuracy": 0.7984383702278137, + "num_tokens": 5386056.0, + "step": 149 + }, + { + "epoch": 0.027855153203342618, + "grad_norm": 2.9892385005950928, + "learning_rate": 9.22029702970297e-08, + "loss": 0.6762, + "mean_token_accuracy": 0.798019528388977, + "num_tokens": 5427185.0, + "step": 150 + }, + { + "epoch": 0.028040854224698236, + "grad_norm": 2.9882304668426514, + "learning_rate": 9.282178217821782e-08, + "loss": 0.7424, + "mean_token_accuracy": 0.788067102432251, + "num_tokens": 5466501.0, + "step": 151 + }, + { + "epoch": 0.028226555246053855, + "grad_norm": 3.3101632595062256, + "learning_rate": 9.344059405940593e-08, + "loss": 0.7532, + "mean_token_accuracy": 0.7826772928237915, + "num_tokens": 5507207.0, + "step": 152 + }, + { + "epoch": 0.02841225626740947, + "grad_norm": 3.3001062870025635, + "learning_rate": 9.405940594059406e-08, + "loss": 0.7589, + "mean_token_accuracy": 0.7758066654205322, + "num_tokens": 5546916.0, + "step": 153 + }, + { + "epoch": 0.02859795728876509, + "grad_norm": 3.0919525623321533, + "learning_rate": 9.467821782178217e-08, + "loss": 0.6783, + "mean_token_accuracy": 0.7977262139320374, + "num_tokens": 5582493.0, + "step": 154 + }, + { + "epoch": 0.028783658310120707, + "grad_norm": 3.323167085647583, + "learning_rate": 9.52970297029703e-08, + "loss": 0.6958, + "mean_token_accuracy": 0.7939679026603699, + "num_tokens": 5619829.0, + "step": 155 + }, + { + "epoch": 0.028969359331476322, + "grad_norm": 2.8897063732147217, + "learning_rate": 9.591584158415842e-08, + "loss": 0.6828, + "mean_token_accuracy": 0.798466682434082, + "num_tokens": 5664200.0, + "step": 156 + }, + { + "epoch": 0.02915506035283194, + "grad_norm": 4.4531145095825195, + "learning_rate": 9.653465346534653e-08, + "loss": 0.694, + "mean_token_accuracy": 0.7921123504638672, + "num_tokens": 5696030.0, + "step": 157 + }, + { + "epoch": 0.02934076137418756, + "grad_norm": 3.131253480911255, + "learning_rate": 9.715346534653465e-08, + "loss": 0.7212, + "mean_token_accuracy": 0.7885173559188843, + "num_tokens": 5738666.0, + "step": 158 + }, + { + "epoch": 0.029526462395543174, + "grad_norm": 3.7974424362182617, + "learning_rate": 9.777227722772277e-08, + "loss": 0.7546, + "mean_token_accuracy": 0.7809740304946899, + "num_tokens": 5771484.0, + "step": 159 + }, + { + "epoch": 0.029712163416898793, + "grad_norm": 3.0998997688293457, + "learning_rate": 9.839108910891089e-08, + "loss": 0.7147, + "mean_token_accuracy": 0.7865748405456543, + "num_tokens": 5807393.0, + "step": 160 + }, + { + "epoch": 0.02989786443825441, + "grad_norm": 4.105050563812256, + "learning_rate": 9.900990099009901e-08, + "loss": 0.7202, + "mean_token_accuracy": 0.7888551950454712, + "num_tokens": 5845870.0, + "step": 161 + }, + { + "epoch": 0.030083565459610027, + "grad_norm": 5.058017730712891, + "learning_rate": 9.962871287128713e-08, + "loss": 0.6812, + "mean_token_accuracy": 0.7967225909233093, + "num_tokens": 5879188.0, + "step": 162 + }, + { + "epoch": 0.030269266480965645, + "grad_norm": 3.1339802742004395, + "learning_rate": 1.0024752475247525e-07, + "loss": 0.7071, + "mean_token_accuracy": 0.7857484817504883, + "num_tokens": 5913997.0, + "step": 163 + }, + { + "epoch": 0.030454967502321264, + "grad_norm": 3.9510905742645264, + "learning_rate": 1.0086633663366336e-07, + "loss": 0.7285, + "mean_token_accuracy": 0.7815824747085571, + "num_tokens": 5947360.0, + "step": 164 + }, + { + "epoch": 0.03064066852367688, + "grad_norm": 3.0481760501861572, + "learning_rate": 1.0148514851485149e-07, + "loss": 0.6447, + "mean_token_accuracy": 0.8044750690460205, + "num_tokens": 5988627.0, + "step": 165 + }, + { + "epoch": 0.030826369545032498, + "grad_norm": 4.29619836807251, + "learning_rate": 1.021039603960396e-07, + "loss": 0.6381, + "mean_token_accuracy": 0.8089412450790405, + "num_tokens": 6018913.0, + "step": 166 + }, + { + "epoch": 0.031012070566388116, + "grad_norm": 3.0971078872680664, + "learning_rate": 1.0272277227722771e-07, + "loss": 0.7047, + "mean_token_accuracy": 0.7925664782524109, + "num_tokens": 6062823.0, + "step": 167 + }, + { + "epoch": 0.03119777158774373, + "grad_norm": 2.9260711669921875, + "learning_rate": 1.0334158415841583e-07, + "loss": 0.6666, + "mean_token_accuracy": 0.8002825379371643, + "num_tokens": 6099612.0, + "step": 168 + }, + { + "epoch": 0.03138347260909935, + "grad_norm": 3.836351156234741, + "learning_rate": 1.0396039603960394e-07, + "loss": 0.6203, + "mean_token_accuracy": 0.8107013702392578, + "num_tokens": 6136602.0, + "step": 169 + }, + { + "epoch": 0.031569173630454965, + "grad_norm": 3.384732484817505, + "learning_rate": 1.0457920792079207e-07, + "loss": 0.6403, + "mean_token_accuracy": 0.8028296232223511, + "num_tokens": 6169127.0, + "step": 170 + }, + { + "epoch": 0.03175487465181059, + "grad_norm": 3.3186607360839844, + "learning_rate": 1.051980198019802e-07, + "loss": 0.7401, + "mean_token_accuracy": 0.7815067768096924, + "num_tokens": 6202801.0, + "step": 171 + }, + { + "epoch": 0.0319405756731662, + "grad_norm": 3.76200008392334, + "learning_rate": 1.058168316831683e-07, + "loss": 0.6268, + "mean_token_accuracy": 0.8098126649856567, + "num_tokens": 6238206.0, + "step": 172 + }, + { + "epoch": 0.03212627669452182, + "grad_norm": 2.8104238510131836, + "learning_rate": 1.0643564356435643e-07, + "loss": 0.6318, + "mean_token_accuracy": 0.8100336790084839, + "num_tokens": 6280760.0, + "step": 173 + }, + { + "epoch": 0.03231197771587744, + "grad_norm": 3.193516492843628, + "learning_rate": 1.0705445544554454e-07, + "loss": 0.66, + "mean_token_accuracy": 0.8020753860473633, + "num_tokens": 6319627.0, + "step": 174 + }, + { + "epoch": 0.032497678737233054, + "grad_norm": 4.606829643249512, + "learning_rate": 1.0767326732673267e-07, + "loss": 0.6762, + "mean_token_accuracy": 0.7975450754165649, + "num_tokens": 6355096.0, + "step": 175 + }, + { + "epoch": 0.03268337975858867, + "grad_norm": 3.425119161605835, + "learning_rate": 1.0829207920792079e-07, + "loss": 0.6918, + "mean_token_accuracy": 0.7933517694473267, + "num_tokens": 6394217.0, + "step": 176 + }, + { + "epoch": 0.03286908077994429, + "grad_norm": 2.8644254207611084, + "learning_rate": 1.089108910891089e-07, + "loss": 0.6365, + "mean_token_accuracy": 0.808327317237854, + "num_tokens": 6437870.0, + "step": 177 + }, + { + "epoch": 0.03305478180129991, + "grad_norm": 3.1611225605010986, + "learning_rate": 1.0952970297029703e-07, + "loss": 0.6829, + "mean_token_accuracy": 0.7958863973617554, + "num_tokens": 6475420.0, + "step": 178 + }, + { + "epoch": 0.03324048282265552, + "grad_norm": 3.9528281688690186, + "learning_rate": 1.1014851485148515e-07, + "loss": 0.7249, + "mean_token_accuracy": 0.7863665223121643, + "num_tokens": 6515395.0, + "step": 179 + }, + { + "epoch": 0.033426183844011144, + "grad_norm": 3.7211191654205322, + "learning_rate": 1.1076732673267326e-07, + "loss": 0.7422, + "mean_token_accuracy": 0.7793841361999512, + "num_tokens": 6545608.0, + "step": 180 + }, + { + "epoch": 0.03361188486536676, + "grad_norm": 3.1333160400390625, + "learning_rate": 1.1138613861386139e-07, + "loss": 0.6492, + "mean_token_accuracy": 0.8039919137954712, + "num_tokens": 6585709.0, + "step": 181 + }, + { + "epoch": 0.033797585886722374, + "grad_norm": 3.357717990875244, + "learning_rate": 1.120049504950495e-07, + "loss": 0.6494, + "mean_token_accuracy": 0.8028237819671631, + "num_tokens": 6620807.0, + "step": 182 + }, + { + "epoch": 0.033983286908077996, + "grad_norm": 4.7885026931762695, + "learning_rate": 1.1262376237623762e-07, + "loss": 0.7314, + "mean_token_accuracy": 0.7821123600006104, + "num_tokens": 6656560.0, + "step": 183 + }, + { + "epoch": 0.03416898792943361, + "grad_norm": 3.3747570514678955, + "learning_rate": 1.1324257425742575e-07, + "loss": 0.6942, + "mean_token_accuracy": 0.7928216457366943, + "num_tokens": 6690059.0, + "step": 184 + }, + { + "epoch": 0.034354688950789226, + "grad_norm": 4.470821857452393, + "learning_rate": 1.1386138613861386e-07, + "loss": 0.6706, + "mean_token_accuracy": 0.7991597056388855, + "num_tokens": 6726264.0, + "step": 185 + }, + { + "epoch": 0.03454038997214485, + "grad_norm": 3.9321608543395996, + "learning_rate": 1.1448019801980198e-07, + "loss": 0.6944, + "mean_token_accuracy": 0.7894922494888306, + "num_tokens": 6760578.0, + "step": 186 + }, + { + "epoch": 0.034726090993500464, + "grad_norm": 3.1293110847473145, + "learning_rate": 1.150990099009901e-07, + "loss": 0.6607, + "mean_token_accuracy": 0.8005179762840271, + "num_tokens": 6795179.0, + "step": 187 + }, + { + "epoch": 0.03491179201485608, + "grad_norm": 4.667661666870117, + "learning_rate": 1.1571782178217822e-07, + "loss": 0.588, + "mean_token_accuracy": 0.8204474449157715, + "num_tokens": 6827892.0, + "step": 188 + }, + { + "epoch": 0.0350974930362117, + "grad_norm": 3.543994903564453, + "learning_rate": 1.1633663366336634e-07, + "loss": 0.6239, + "mean_token_accuracy": 0.8115885853767395, + "num_tokens": 6865092.0, + "step": 189 + }, + { + "epoch": 0.035283194057567316, + "grad_norm": 3.2626845836639404, + "learning_rate": 1.1695544554455445e-07, + "loss": 0.6229, + "mean_token_accuracy": 0.8120800256729126, + "num_tokens": 6900750.0, + "step": 190 + }, + { + "epoch": 0.03546889507892293, + "grad_norm": 3.730299949645996, + "learning_rate": 1.1757425742574257e-07, + "loss": 0.6654, + "mean_token_accuracy": 0.8013834953308105, + "num_tokens": 6942364.0, + "step": 191 + }, + { + "epoch": 0.03565459610027855, + "grad_norm": 5.211744785308838, + "learning_rate": 1.1819306930693068e-07, + "loss": 0.6875, + "mean_token_accuracy": 0.7923457622528076, + "num_tokens": 6981745.0, + "step": 192 + }, + { + "epoch": 0.03584029712163417, + "grad_norm": 7.018049240112305, + "learning_rate": 1.188118811881188e-07, + "loss": 0.6836, + "mean_token_accuracy": 0.7933091521263123, + "num_tokens": 7017246.0, + "step": 193 + }, + { + "epoch": 0.03602599814298978, + "grad_norm": 5.3266754150390625, + "learning_rate": 1.194306930693069e-07, + "loss": 0.6604, + "mean_token_accuracy": 0.8051106929779053, + "num_tokens": 7052651.0, + "step": 194 + }, + { + "epoch": 0.036211699164345405, + "grad_norm": 3.4597935676574707, + "learning_rate": 1.2004950495049505e-07, + "loss": 0.6394, + "mean_token_accuracy": 0.8050005435943604, + "num_tokens": 7087424.0, + "step": 195 + }, + { + "epoch": 0.03639740018570102, + "grad_norm": 5.080013751983643, + "learning_rate": 1.2066831683168316e-07, + "loss": 0.655, + "mean_token_accuracy": 0.7988379597663879, + "num_tokens": 7121310.0, + "step": 196 + }, + { + "epoch": 0.036583101207056636, + "grad_norm": 3.694525718688965, + "learning_rate": 1.2128712871287127e-07, + "loss": 0.6209, + "mean_token_accuracy": 0.8094715476036072, + "num_tokens": 7156732.0, + "step": 197 + }, + { + "epoch": 0.03676880222841226, + "grad_norm": 3.9085710048675537, + "learning_rate": 1.2190594059405938e-07, + "loss": 0.6717, + "mean_token_accuracy": 0.7974846363067627, + "num_tokens": 7190095.0, + "step": 198 + }, + { + "epoch": 0.03695450324976787, + "grad_norm": 3.4195048809051514, + "learning_rate": 1.2252475247524752e-07, + "loss": 0.6286, + "mean_token_accuracy": 0.8085407614707947, + "num_tokens": 7223354.0, + "step": 199 + }, + { + "epoch": 0.03714020427112349, + "grad_norm": 2.9460952281951904, + "learning_rate": 1.2314356435643563e-07, + "loss": 0.648, + "mean_token_accuracy": 0.8026831746101379, + "num_tokens": 7263044.0, + "step": 200 + }, + { + "epoch": 0.03732590529247911, + "grad_norm": 4.0447096824646, + "learning_rate": 1.2376237623762375e-07, + "loss": 0.6176, + "mean_token_accuracy": 0.8123798370361328, + "num_tokens": 7297139.0, + "step": 201 + }, + { + "epoch": 0.037511606313834725, + "grad_norm": 4.0398125648498535, + "learning_rate": 1.2438118811881188e-07, + "loss": 0.6532, + "mean_token_accuracy": 0.8041608333587646, + "num_tokens": 7333237.0, + "step": 202 + }, + { + "epoch": 0.03769730733519034, + "grad_norm": 4.376633167266846, + "learning_rate": 1.25e-07, + "loss": 0.6163, + "mean_token_accuracy": 0.8161766529083252, + "num_tokens": 7369579.0, + "step": 203 + }, + { + "epoch": 0.03788300835654596, + "grad_norm": 5.155468463897705, + "learning_rate": 1.256188118811881e-07, + "loss": 0.5839, + "mean_token_accuracy": 0.820399820804596, + "num_tokens": 7402925.0, + "step": 204 + }, + { + "epoch": 0.03806870937790158, + "grad_norm": 4.026839733123779, + "learning_rate": 1.2623762376237624e-07, + "loss": 0.6619, + "mean_token_accuracy": 0.7998524308204651, + "num_tokens": 7438888.0, + "step": 205 + }, + { + "epoch": 0.0382544103992572, + "grad_norm": 4.643642902374268, + "learning_rate": 1.2685643564356435e-07, + "loss": 0.6586, + "mean_token_accuracy": 0.8012629747390747, + "num_tokens": 7476848.0, + "step": 206 + }, + { + "epoch": 0.038440111420612814, + "grad_norm": 4.906761646270752, + "learning_rate": 1.2747524752475247e-07, + "loss": 0.6611, + "mean_token_accuracy": 0.7997990846633911, + "num_tokens": 7514632.0, + "step": 207 + }, + { + "epoch": 0.03862581244196843, + "grad_norm": 4.731776714324951, + "learning_rate": 1.280940594059406e-07, + "loss": 0.6709, + "mean_token_accuracy": 0.8012107610702515, + "num_tokens": 7551822.0, + "step": 208 + }, + { + "epoch": 0.03881151346332405, + "grad_norm": 4.753050327301025, + "learning_rate": 1.2871287128712872e-07, + "loss": 0.6275, + "mean_token_accuracy": 0.8080782890319824, + "num_tokens": 7591389.0, + "step": 209 + }, + { + "epoch": 0.03899721448467967, + "grad_norm": 5.039056301116943, + "learning_rate": 1.2933168316831683e-07, + "loss": 0.6849, + "mean_token_accuracy": 0.7944158315658569, + "num_tokens": 7632948.0, + "step": 210 + }, + { + "epoch": 0.03918291550603528, + "grad_norm": 5.888190746307373, + "learning_rate": 1.2995049504950494e-07, + "loss": 0.6437, + "mean_token_accuracy": 0.8032447695732117, + "num_tokens": 7666643.0, + "step": 211 + }, + { + "epoch": 0.039368616527390904, + "grad_norm": 4.644284248352051, + "learning_rate": 1.3056930693069308e-07, + "loss": 0.6521, + "mean_token_accuracy": 0.801324725151062, + "num_tokens": 7697762.0, + "step": 212 + }, + { + "epoch": 0.03955431754874652, + "grad_norm": 5.766803741455078, + "learning_rate": 1.311881188118812e-07, + "loss": 0.6315, + "mean_token_accuracy": 0.8019748330116272, + "num_tokens": 7731314.0, + "step": 213 + }, + { + "epoch": 0.039740018570102134, + "grad_norm": 5.234568119049072, + "learning_rate": 1.318069306930693e-07, + "loss": 0.6492, + "mean_token_accuracy": 0.8017306327819824, + "num_tokens": 7765349.0, + "step": 214 + }, + { + "epoch": 0.039925719591457756, + "grad_norm": 4.565785884857178, + "learning_rate": 1.3242574257425744e-07, + "loss": 0.5836, + "mean_token_accuracy": 0.8218050003051758, + "num_tokens": 7806343.0, + "step": 215 + }, + { + "epoch": 0.04011142061281337, + "grad_norm": 3.6516730785369873, + "learning_rate": 1.3304455445544555e-07, + "loss": 0.6451, + "mean_token_accuracy": 0.8010683655738831, + "num_tokens": 7839099.0, + "step": 216 + }, + { + "epoch": 0.040297121634168986, + "grad_norm": 3.6252949237823486, + "learning_rate": 1.3366336633663366e-07, + "loss": 0.6548, + "mean_token_accuracy": 0.7981666922569275, + "num_tokens": 7876724.0, + "step": 217 + }, + { + "epoch": 0.04048282265552461, + "grad_norm": 4.594324588775635, + "learning_rate": 1.342821782178218e-07, + "loss": 0.6387, + "mean_token_accuracy": 0.8047734498977661, + "num_tokens": 7910616.0, + "step": 218 + }, + { + "epoch": 0.040668523676880224, + "grad_norm": 4.254075050354004, + "learning_rate": 1.349009900990099e-07, + "loss": 0.6658, + "mean_token_accuracy": 0.799318790435791, + "num_tokens": 7945049.0, + "step": 219 + }, + { + "epoch": 0.04085422469823584, + "grad_norm": 3.8038330078125, + "learning_rate": 1.3551980198019802e-07, + "loss": 0.6015, + "mean_token_accuracy": 0.8156794309616089, + "num_tokens": 7984477.0, + "step": 220 + }, + { + "epoch": 0.04103992571959146, + "grad_norm": 3.8200888633728027, + "learning_rate": 1.3613861386138613e-07, + "loss": 0.6101, + "mean_token_accuracy": 0.8153351545333862, + "num_tokens": 8024420.0, + "step": 221 + }, + { + "epoch": 0.041225626740947076, + "grad_norm": 4.018529415130615, + "learning_rate": 1.3675742574257427e-07, + "loss": 0.5641, + "mean_token_accuracy": 0.8256990313529968, + "num_tokens": 8062933.0, + "step": 222 + }, + { + "epoch": 0.04141132776230269, + "grad_norm": 3.676299571990967, + "learning_rate": 1.3737623762376238e-07, + "loss": 0.6694, + "mean_token_accuracy": 0.7969991564750671, + "num_tokens": 8096338.0, + "step": 223 + }, + { + "epoch": 0.04159702878365831, + "grad_norm": 4.143718242645264, + "learning_rate": 1.379950495049505e-07, + "loss": 0.6758, + "mean_token_accuracy": 0.7931668758392334, + "num_tokens": 8134855.0, + "step": 224 + }, + { + "epoch": 0.04178272980501393, + "grad_norm": 4.888700485229492, + "learning_rate": 1.3861386138613863e-07, + "loss": 0.6337, + "mean_token_accuracy": 0.8091689348220825, + "num_tokens": 8168467.0, + "step": 225 + }, + { + "epoch": 0.04196843082636954, + "grad_norm": 4.379824161529541, + "learning_rate": 1.3923267326732674e-07, + "loss": 0.5676, + "mean_token_accuracy": 0.8296993970870972, + "num_tokens": 8212384.0, + "step": 226 + }, + { + "epoch": 0.042154131847725165, + "grad_norm": 4.201019287109375, + "learning_rate": 1.3985148514851485e-07, + "loss": 0.6242, + "mean_token_accuracy": 0.8130043148994446, + "num_tokens": 8251080.0, + "step": 227 + }, + { + "epoch": 0.04233983286908078, + "grad_norm": 4.030401706695557, + "learning_rate": 1.40470297029703e-07, + "loss": 0.6812, + "mean_token_accuracy": 0.7928903102874756, + "num_tokens": 8289623.0, + "step": 228 + }, + { + "epoch": 0.042525533890436396, + "grad_norm": 5.515501976013184, + "learning_rate": 1.4108910891089107e-07, + "loss": 0.5842, + "mean_token_accuracy": 0.8218624591827393, + "num_tokens": 8328275.0, + "step": 229 + }, + { + "epoch": 0.04271123491179202, + "grad_norm": 3.7834999561309814, + "learning_rate": 1.4170792079207919e-07, + "loss": 0.6156, + "mean_token_accuracy": 0.8116291165351868, + "num_tokens": 8369843.0, + "step": 230 + }, + { + "epoch": 0.04289693593314763, + "grad_norm": 4.324851989746094, + "learning_rate": 1.423267326732673e-07, + "loss": 0.6914, + "mean_token_accuracy": 0.7932218909263611, + "num_tokens": 8407467.0, + "step": 231 + }, + { + "epoch": 0.04308263695450325, + "grad_norm": 4.626551628112793, + "learning_rate": 1.4294554455445543e-07, + "loss": 0.6583, + "mean_token_accuracy": 0.8030170798301697, + "num_tokens": 8441524.0, + "step": 232 + }, + { + "epoch": 0.04326833797585887, + "grad_norm": 5.884190082550049, + "learning_rate": 1.4356435643564355e-07, + "loss": 0.6028, + "mean_token_accuracy": 0.8139046430587769, + "num_tokens": 8473615.0, + "step": 233 + }, + { + "epoch": 0.043454038997214485, + "grad_norm": 3.3852555751800537, + "learning_rate": 1.4418316831683166e-07, + "loss": 0.6465, + "mean_token_accuracy": 0.8052716255187988, + "num_tokens": 8516613.0, + "step": 234 + }, + { + "epoch": 0.0436397400185701, + "grad_norm": 4.068397521972656, + "learning_rate": 1.448019801980198e-07, + "loss": 0.592, + "mean_token_accuracy": 0.8196201324462891, + "num_tokens": 8556012.0, + "step": 235 + }, + { + "epoch": 0.04382544103992572, + "grad_norm": 4.026906490325928, + "learning_rate": 1.454207920792079e-07, + "loss": 0.5985, + "mean_token_accuracy": 0.8150103092193604, + "num_tokens": 8597563.0, + "step": 236 + }, + { + "epoch": 0.04401114206128134, + "grad_norm": 5.283498764038086, + "learning_rate": 1.4603960396039602e-07, + "loss": 0.5958, + "mean_token_accuracy": 0.8179425001144409, + "num_tokens": 8628982.0, + "step": 237 + }, + { + "epoch": 0.04419684308263695, + "grad_norm": 4.926267147064209, + "learning_rate": 1.4665841584158416e-07, + "loss": 0.6117, + "mean_token_accuracy": 0.8156458139419556, + "num_tokens": 8669371.0, + "step": 238 + }, + { + "epoch": 0.044382544103992574, + "grad_norm": 3.981058359146118, + "learning_rate": 1.4727722772277227e-07, + "loss": 0.5785, + "mean_token_accuracy": 0.8236161470413208, + "num_tokens": 8705508.0, + "step": 239 + }, + { + "epoch": 0.04456824512534819, + "grad_norm": 4.484579563140869, + "learning_rate": 1.4789603960396038e-07, + "loss": 0.6728, + "mean_token_accuracy": 0.7925794124603271, + "num_tokens": 8734565.0, + "step": 240 + }, + { + "epoch": 0.044753946146703805, + "grad_norm": 5.01863431930542, + "learning_rate": 1.485148514851485e-07, + "loss": 0.6493, + "mean_token_accuracy": 0.8009129166603088, + "num_tokens": 8774771.0, + "step": 241 + }, + { + "epoch": 0.04493964716805943, + "grad_norm": 4.63724422454834, + "learning_rate": 1.4913366336633663e-07, + "loss": 0.5871, + "mean_token_accuracy": 0.8173068761825562, + "num_tokens": 8807807.0, + "step": 242 + }, + { + "epoch": 0.04512534818941504, + "grad_norm": 3.653273105621338, + "learning_rate": 1.4975247524752474e-07, + "loss": 0.6577, + "mean_token_accuracy": 0.7977468371391296, + "num_tokens": 8845059.0, + "step": 243 + }, + { + "epoch": 0.04531104921077066, + "grad_norm": 4.380971908569336, + "learning_rate": 1.5037128712871285e-07, + "loss": 0.682, + "mean_token_accuracy": 0.7950884699821472, + "num_tokens": 8886249.0, + "step": 244 + }, + { + "epoch": 0.04549675023212628, + "grad_norm": 4.535455703735352, + "learning_rate": 1.50990099009901e-07, + "loss": 0.6278, + "mean_token_accuracy": 0.8084548711776733, + "num_tokens": 8922357.0, + "step": 245 + }, + { + "epoch": 0.045682451253481894, + "grad_norm": 4.324915409088135, + "learning_rate": 1.516089108910891e-07, + "loss": 0.6184, + "mean_token_accuracy": 0.812975287437439, + "num_tokens": 8959923.0, + "step": 246 + }, + { + "epoch": 0.04586815227483751, + "grad_norm": 4.422811985015869, + "learning_rate": 1.522277227722772e-07, + "loss": 0.6237, + "mean_token_accuracy": 0.807901918888092, + "num_tokens": 9000507.0, + "step": 247 + }, + { + "epoch": 0.04605385329619313, + "grad_norm": 5.348662853240967, + "learning_rate": 1.5284653465346535e-07, + "loss": 0.6898, + "mean_token_accuracy": 0.7874898910522461, + "num_tokens": 9038789.0, + "step": 248 + }, + { + "epoch": 0.046239554317548746, + "grad_norm": 5.657570838928223, + "learning_rate": 1.5346534653465346e-07, + "loss": 0.5458, + "mean_token_accuracy": 0.8317272663116455, + "num_tokens": 9075380.0, + "step": 249 + }, + { + "epoch": 0.04642525533890436, + "grad_norm": 4.2996368408203125, + "learning_rate": 1.5408415841584157e-07, + "loss": 0.6508, + "mean_token_accuracy": 0.8038384914398193, + "num_tokens": 9108048.0, + "step": 250 + }, + { + "epoch": 0.046610956360259984, + "grad_norm": 3.733651638031006, + "learning_rate": 1.5470297029702968e-07, + "loss": 0.6158, + "mean_token_accuracy": 0.8141145706176758, + "num_tokens": 9151727.0, + "step": 251 + }, + { + "epoch": 0.0467966573816156, + "grad_norm": 5.353424549102783, + "learning_rate": 1.5532178217821782e-07, + "loss": 0.6462, + "mean_token_accuracy": 0.7988307476043701, + "num_tokens": 9187660.0, + "step": 252 + }, + { + "epoch": 0.046982358402971214, + "grad_norm": 4.3972086906433105, + "learning_rate": 1.5594059405940593e-07, + "loss": 0.6106, + "mean_token_accuracy": 0.8114413619041443, + "num_tokens": 9221957.0, + "step": 253 + }, + { + "epoch": 0.047168059424326836, + "grad_norm": 3.833679437637329, + "learning_rate": 1.5655940594059404e-07, + "loss": 0.6161, + "mean_token_accuracy": 0.8107074499130249, + "num_tokens": 9249350.0, + "step": 254 + }, + { + "epoch": 0.04735376044568245, + "grad_norm": 4.738914966583252, + "learning_rate": 1.5717821782178218e-07, + "loss": 0.6414, + "mean_token_accuracy": 0.8056286573410034, + "num_tokens": 9285857.0, + "step": 255 + }, + { + "epoch": 0.047539461467038066, + "grad_norm": 4.08380126953125, + "learning_rate": 1.577970297029703e-07, + "loss": 0.6157, + "mean_token_accuracy": 0.8122696876525879, + "num_tokens": 9322981.0, + "step": 256 + }, + { + "epoch": 0.04772516248839369, + "grad_norm": 4.8156657218933105, + "learning_rate": 1.584158415841584e-07, + "loss": 0.6308, + "mean_token_accuracy": 0.806787371635437, + "num_tokens": 9357156.0, + "step": 257 + }, + { + "epoch": 0.0479108635097493, + "grad_norm": 4.937153339385986, + "learning_rate": 1.5903465346534654e-07, + "loss": 0.6612, + "mean_token_accuracy": 0.8008614182472229, + "num_tokens": 9391756.0, + "step": 258 + }, + { + "epoch": 0.04809656453110492, + "grad_norm": 4.157207489013672, + "learning_rate": 1.5965346534653465e-07, + "loss": 0.666, + "mean_token_accuracy": 0.7946168780326843, + "num_tokens": 9430327.0, + "step": 259 + }, + { + "epoch": 0.04828226555246054, + "grad_norm": 5.719115734100342, + "learning_rate": 1.6027227722772276e-07, + "loss": 0.6249, + "mean_token_accuracy": 0.8092604279518127, + "num_tokens": 9463217.0, + "step": 260 + }, + { + "epoch": 0.048467966573816156, + "grad_norm": 5.986067295074463, + "learning_rate": 1.608910891089109e-07, + "loss": 0.598, + "mean_token_accuracy": 0.8152942657470703, + "num_tokens": 9495284.0, + "step": 261 + }, + { + "epoch": 0.04865366759517177, + "grad_norm": 3.864417314529419, + "learning_rate": 1.61509900990099e-07, + "loss": 0.5687, + "mean_token_accuracy": 0.824743390083313, + "num_tokens": 9537139.0, + "step": 262 + }, + { + "epoch": 0.04883936861652739, + "grad_norm": 4.238833427429199, + "learning_rate": 1.6212871287128712e-07, + "loss": 0.6069, + "mean_token_accuracy": 0.8110176920890808, + "num_tokens": 9574582.0, + "step": 263 + }, + { + "epoch": 0.04902506963788301, + "grad_norm": 3.811584711074829, + "learning_rate": 1.6274752475247523e-07, + "loss": 0.5769, + "mean_token_accuracy": 0.8228267431259155, + "num_tokens": 9611286.0, + "step": 264 + }, + { + "epoch": 0.04921077065923862, + "grad_norm": 4.129339218139648, + "learning_rate": 1.6336633663366337e-07, + "loss": 0.5787, + "mean_token_accuracy": 0.8199238181114197, + "num_tokens": 9642641.0, + "step": 265 + }, + { + "epoch": 0.049396471680594245, + "grad_norm": 4.131208896636963, + "learning_rate": 1.6398514851485148e-07, + "loss": 0.5897, + "mean_token_accuracy": 0.8187601566314697, + "num_tokens": 9678038.0, + "step": 266 + }, + { + "epoch": 0.04958217270194986, + "grad_norm": 3.224015951156616, + "learning_rate": 1.646039603960396e-07, + "loss": 0.6332, + "mean_token_accuracy": 0.8063594102859497, + "num_tokens": 9715481.0, + "step": 267 + }, + { + "epoch": 0.049767873723305475, + "grad_norm": 4.826350688934326, + "learning_rate": 1.6522277227722773e-07, + "loss": 0.589, + "mean_token_accuracy": 0.8179001808166504, + "num_tokens": 9753444.0, + "step": 268 + }, + { + "epoch": 0.0499535747446611, + "grad_norm": 3.2880003452301025, + "learning_rate": 1.6584158415841584e-07, + "loss": 0.7364, + "mean_token_accuracy": 0.7744400501251221, + "num_tokens": 9785080.0, + "step": 269 + }, + { + "epoch": 0.05013927576601671, + "grad_norm": 3.9770689010620117, + "learning_rate": 1.6646039603960396e-07, + "loss": 0.6845, + "mean_token_accuracy": 0.7896652221679688, + "num_tokens": 9819644.0, + "step": 270 + }, + { + "epoch": 0.05032497678737233, + "grad_norm": 4.302327632904053, + "learning_rate": 1.670792079207921e-07, + "loss": 0.6303, + "mean_token_accuracy": 0.8048700094223022, + "num_tokens": 9857125.0, + "step": 271 + }, + { + "epoch": 0.05051067780872795, + "grad_norm": 3.991359233856201, + "learning_rate": 1.676980198019802e-07, + "loss": 0.6727, + "mean_token_accuracy": 0.7961969375610352, + "num_tokens": 9895577.0, + "step": 272 + }, + { + "epoch": 0.050696378830083565, + "grad_norm": 4.313619613647461, + "learning_rate": 1.6831683168316832e-07, + "loss": 0.5716, + "mean_token_accuracy": 0.819197416305542, + "num_tokens": 9930056.0, + "step": 273 + }, + { + "epoch": 0.05088207985143918, + "grad_norm": 4.265520095825195, + "learning_rate": 1.6893564356435643e-07, + "loss": 0.5859, + "mean_token_accuracy": 0.8139947652816772, + "num_tokens": 9961748.0, + "step": 274 + }, + { + "epoch": 0.0510677808727948, + "grad_norm": 3.5076189041137695, + "learning_rate": 1.6955445544554456e-07, + "loss": 0.5905, + "mean_token_accuracy": 0.8150898218154907, + "num_tokens": 9995755.0, + "step": 275 + }, + { + "epoch": 0.05125348189415042, + "grad_norm": 5.062225341796875, + "learning_rate": 1.7017326732673268e-07, + "loss": 0.6203, + "mean_token_accuracy": 0.8122956156730652, + "num_tokens": 10028169.0, + "step": 276 + }, + { + "epoch": 0.05143918291550603, + "grad_norm": 4.313332557678223, + "learning_rate": 1.707920792079208e-07, + "loss": 0.5905, + "mean_token_accuracy": 0.8094041347503662, + "num_tokens": 10065571.0, + "step": 277 + }, + { + "epoch": 0.051624883936861654, + "grad_norm": 3.7135138511657715, + "learning_rate": 1.7141089108910893e-07, + "loss": 0.6011, + "mean_token_accuracy": 0.8129053115844727, + "num_tokens": 10100277.0, + "step": 278 + }, + { + "epoch": 0.05181058495821727, + "grad_norm": 3.91821551322937, + "learning_rate": 1.72029702970297e-07, + "loss": 0.6505, + "mean_token_accuracy": 0.8039917945861816, + "num_tokens": 10136658.0, + "step": 279 + }, + { + "epoch": 0.051996285979572884, + "grad_norm": 5.216306686401367, + "learning_rate": 1.7264851485148512e-07, + "loss": 0.539, + "mean_token_accuracy": 0.8287389874458313, + "num_tokens": 10166440.0, + "step": 280 + }, + { + "epoch": 0.052181987000928506, + "grad_norm": 4.662815570831299, + "learning_rate": 1.7326732673267326e-07, + "loss": 0.5794, + "mean_token_accuracy": 0.819482147693634, + "num_tokens": 10195315.0, + "step": 281 + }, + { + "epoch": 0.05236768802228412, + "grad_norm": 3.3327889442443848, + "learning_rate": 1.7388613861386137e-07, + "loss": 0.6039, + "mean_token_accuracy": 0.8131263256072998, + "num_tokens": 10233944.0, + "step": 282 + }, + { + "epoch": 0.05255338904363974, + "grad_norm": 4.673796653747559, + "learning_rate": 1.7450495049504948e-07, + "loss": 0.555, + "mean_token_accuracy": 0.8281484842300415, + "num_tokens": 10269025.0, + "step": 283 + }, + { + "epoch": 0.05273909006499536, + "grad_norm": 4.644338607788086, + "learning_rate": 1.751237623762376e-07, + "loss": 0.5431, + "mean_token_accuracy": 0.8322693109512329, + "num_tokens": 10307636.0, + "step": 284 + }, + { + "epoch": 0.052924791086350974, + "grad_norm": 3.597602367401123, + "learning_rate": 1.7574257425742573e-07, + "loss": 0.6072, + "mean_token_accuracy": 0.8128302693367004, + "num_tokens": 10343781.0, + "step": 285 + }, + { + "epoch": 0.053110492107706596, + "grad_norm": 4.876959800720215, + "learning_rate": 1.7636138613861384e-07, + "loss": 0.6317, + "mean_token_accuracy": 0.8060397505760193, + "num_tokens": 10381506.0, + "step": 286 + }, + { + "epoch": 0.05329619312906221, + "grad_norm": 3.847252368927002, + "learning_rate": 1.7698019801980195e-07, + "loss": 0.5263, + "mean_token_accuracy": 0.8338136672973633, + "num_tokens": 10415573.0, + "step": 287 + }, + { + "epoch": 0.053481894150417826, + "grad_norm": 3.707106351852417, + "learning_rate": 1.775990099009901e-07, + "loss": 0.616, + "mean_token_accuracy": 0.8081375956535339, + "num_tokens": 10451810.0, + "step": 288 + }, + { + "epoch": 0.05366759517177345, + "grad_norm": 3.66595458984375, + "learning_rate": 1.782178217821782e-07, + "loss": 0.565, + "mean_token_accuracy": 0.8233596682548523, + "num_tokens": 10489952.0, + "step": 289 + }, + { + "epoch": 0.05385329619312906, + "grad_norm": 3.22930645942688, + "learning_rate": 1.7883663366336631e-07, + "loss": 0.6546, + "mean_token_accuracy": 0.7998954057693481, + "num_tokens": 10527879.0, + "step": 290 + }, + { + "epoch": 0.05403899721448468, + "grad_norm": 3.2949042320251465, + "learning_rate": 1.7945544554455445e-07, + "loss": 0.5925, + "mean_token_accuracy": 0.8130749464035034, + "num_tokens": 10568189.0, + "step": 291 + }, + { + "epoch": 0.0542246982358403, + "grad_norm": 3.827343225479126, + "learning_rate": 1.8007425742574256e-07, + "loss": 0.5959, + "mean_token_accuracy": 0.8141276836395264, + "num_tokens": 10607597.0, + "step": 292 + }, + { + "epoch": 0.054410399257195916, + "grad_norm": 4.106557369232178, + "learning_rate": 1.8069306930693067e-07, + "loss": 0.628, + "mean_token_accuracy": 0.8068208694458008, + "num_tokens": 10642824.0, + "step": 293 + }, + { + "epoch": 0.05459610027855153, + "grad_norm": 4.814352512359619, + "learning_rate": 1.8131188118811879e-07, + "loss": 0.576, + "mean_token_accuracy": 0.8166836500167847, + "num_tokens": 10672618.0, + "step": 294 + }, + { + "epoch": 0.05478180129990715, + "grad_norm": 3.675309419631958, + "learning_rate": 1.8193069306930692e-07, + "loss": 0.5742, + "mean_token_accuracy": 0.8188921213150024, + "num_tokens": 10704458.0, + "step": 295 + }, + { + "epoch": 0.05496750232126277, + "grad_norm": 3.772711753845215, + "learning_rate": 1.8254950495049503e-07, + "loss": 0.6367, + "mean_token_accuracy": 0.8071712255477905, + "num_tokens": 10740960.0, + "step": 296 + }, + { + "epoch": 0.05515320334261838, + "grad_norm": 4.140458106994629, + "learning_rate": 1.8316831683168315e-07, + "loss": 0.5779, + "mean_token_accuracy": 0.8241461515426636, + "num_tokens": 10771565.0, + "step": 297 + }, + { + "epoch": 0.055338904363974005, + "grad_norm": 3.248957633972168, + "learning_rate": 1.8378712871287128e-07, + "loss": 0.5526, + "mean_token_accuracy": 0.8260301351547241, + "num_tokens": 10813153.0, + "step": 298 + }, + { + "epoch": 0.05552460538532962, + "grad_norm": 4.380034446716309, + "learning_rate": 1.844059405940594e-07, + "loss": 0.5876, + "mean_token_accuracy": 0.8224937915802002, + "num_tokens": 10842306.0, + "step": 299 + }, + { + "epoch": 0.055710306406685235, + "grad_norm": 3.855950117111206, + "learning_rate": 1.850247524752475e-07, + "loss": 0.6186, + "mean_token_accuracy": 0.8079866170883179, + "num_tokens": 10877561.0, + "step": 300 + }, + { + "epoch": 0.05589600742804086, + "grad_norm": 3.097525119781494, + "learning_rate": 1.8564356435643564e-07, + "loss": 0.6063, + "mean_token_accuracy": 0.8126264810562134, + "num_tokens": 10916917.0, + "step": 301 + }, + { + "epoch": 0.05608170844939647, + "grad_norm": 3.0066471099853516, + "learning_rate": 1.8626237623762376e-07, + "loss": 0.5804, + "mean_token_accuracy": 0.8182780742645264, + "num_tokens": 10953542.0, + "step": 302 + }, + { + "epoch": 0.05626740947075209, + "grad_norm": 3.4566938877105713, + "learning_rate": 1.8688118811881187e-07, + "loss": 0.5367, + "mean_token_accuracy": 0.8310822248458862, + "num_tokens": 10989907.0, + "step": 303 + }, + { + "epoch": 0.05645311049210771, + "grad_norm": 4.8564300537109375, + "learning_rate": 1.875e-07, + "loss": 0.567, + "mean_token_accuracy": 0.8241435885429382, + "num_tokens": 11023276.0, + "step": 304 + }, + { + "epoch": 0.056638811513463325, + "grad_norm": 4.534656524658203, + "learning_rate": 1.8811881188118812e-07, + "loss": 0.5789, + "mean_token_accuracy": 0.8219332098960876, + "num_tokens": 11056328.0, + "step": 305 + }, + { + "epoch": 0.05682451253481894, + "grad_norm": 4.154794216156006, + "learning_rate": 1.8873762376237623e-07, + "loss": 0.6408, + "mean_token_accuracy": 0.7993330955505371, + "num_tokens": 11092810.0, + "step": 306 + }, + { + "epoch": 0.05701021355617456, + "grad_norm": 3.275716543197632, + "learning_rate": 1.8935643564356434e-07, + "loss": 0.6333, + "mean_token_accuracy": 0.8031238913536072, + "num_tokens": 11131835.0, + "step": 307 + }, + { + "epoch": 0.05719591457753018, + "grad_norm": 3.572916030883789, + "learning_rate": 1.8997524752475248e-07, + "loss": 0.5771, + "mean_token_accuracy": 0.8191787004470825, + "num_tokens": 11168134.0, + "step": 308 + }, + { + "epoch": 0.05738161559888579, + "grad_norm": 3.165692090988159, + "learning_rate": 1.905940594059406e-07, + "loss": 0.5756, + "mean_token_accuracy": 0.8190802335739136, + "num_tokens": 11206292.0, + "step": 309 + }, + { + "epoch": 0.057567316620241414, + "grad_norm": 3.1597752571105957, + "learning_rate": 1.912128712871287e-07, + "loss": 0.614, + "mean_token_accuracy": 0.8074110746383667, + "num_tokens": 11245472.0, + "step": 310 + }, + { + "epoch": 0.05775301764159703, + "grad_norm": 3.7965505123138428, + "learning_rate": 1.9183168316831684e-07, + "loss": 0.5605, + "mean_token_accuracy": 0.8220847845077515, + "num_tokens": 11274729.0, + "step": 311 + }, + { + "epoch": 0.057938718662952644, + "grad_norm": 2.976688861846924, + "learning_rate": 1.9245049504950495e-07, + "loss": 0.5689, + "mean_token_accuracy": 0.8277688026428223, + "num_tokens": 11308734.0, + "step": 312 + }, + { + "epoch": 0.058124419684308266, + "grad_norm": 3.0718908309936523, + "learning_rate": 1.9306930693069306e-07, + "loss": 0.543, + "mean_token_accuracy": 0.8290036916732788, + "num_tokens": 11344616.0, + "step": 313 + }, + { + "epoch": 0.05831012070566388, + "grad_norm": 2.8919739723205566, + "learning_rate": 1.936881188118812e-07, + "loss": 0.5953, + "mean_token_accuracy": 0.8136767148971558, + "num_tokens": 11383870.0, + "step": 314 + }, + { + "epoch": 0.0584958217270195, + "grad_norm": 3.76827335357666, + "learning_rate": 1.943069306930693e-07, + "loss": 0.6277, + "mean_token_accuracy": 0.8086541295051575, + "num_tokens": 11417115.0, + "step": 315 + }, + { + "epoch": 0.05868152274837512, + "grad_norm": 3.615659236907959, + "learning_rate": 1.9492574257425742e-07, + "loss": 0.5801, + "mean_token_accuracy": 0.8216282725334167, + "num_tokens": 11450802.0, + "step": 316 + }, + { + "epoch": 0.058867223769730734, + "grad_norm": 3.139730453491211, + "learning_rate": 1.9554455445544553e-07, + "loss": 0.6025, + "mean_token_accuracy": 0.8140665292739868, + "num_tokens": 11492677.0, + "step": 317 + }, + { + "epoch": 0.05905292479108635, + "grad_norm": 3.344223737716675, + "learning_rate": 1.9616336633663367e-07, + "loss": 0.5467, + "mean_token_accuracy": 0.8323314785957336, + "num_tokens": 11529799.0, + "step": 318 + }, + { + "epoch": 0.05923862581244197, + "grad_norm": 3.797813653945923, + "learning_rate": 1.9678217821782178e-07, + "loss": 0.6573, + "mean_token_accuracy": 0.7936639785766602, + "num_tokens": 11565835.0, + "step": 319 + }, + { + "epoch": 0.059424326833797586, + "grad_norm": 2.961275339126587, + "learning_rate": 1.974009900990099e-07, + "loss": 0.5592, + "mean_token_accuracy": 0.8241486549377441, + "num_tokens": 11602417.0, + "step": 320 + }, + { + "epoch": 0.0596100278551532, + "grad_norm": 3.207777261734009, + "learning_rate": 1.9801980198019803e-07, + "loss": 0.5707, + "mean_token_accuracy": 0.820152997970581, + "num_tokens": 11638108.0, + "step": 321 + }, + { + "epoch": 0.05979572887650882, + "grad_norm": 2.8686962127685547, + "learning_rate": 1.9863861386138614e-07, + "loss": 0.5698, + "mean_token_accuracy": 0.8220228552818298, + "num_tokens": 11678142.0, + "step": 322 + }, + { + "epoch": 0.05998142989786444, + "grad_norm": 3.1867611408233643, + "learning_rate": 1.9925742574257425e-07, + "loss": 0.534, + "mean_token_accuracy": 0.8320428133010864, + "num_tokens": 11720385.0, + "step": 323 + }, + { + "epoch": 0.06016713091922005, + "grad_norm": 3.473145008087158, + "learning_rate": 1.998762376237624e-07, + "loss": 0.555, + "mean_token_accuracy": 0.8286916613578796, + "num_tokens": 11756971.0, + "step": 324 + }, + { + "epoch": 0.060352831940575676, + "grad_norm": 3.630554437637329, + "learning_rate": 2.004950495049505e-07, + "loss": 0.6044, + "mean_token_accuracy": 0.8075124025344849, + "num_tokens": 11788993.0, + "step": 325 + }, + { + "epoch": 0.06053853296193129, + "grad_norm": 3.3843421936035156, + "learning_rate": 2.011138613861386e-07, + "loss": 0.5528, + "mean_token_accuracy": 0.8228915929794312, + "num_tokens": 11834213.0, + "step": 326 + }, + { + "epoch": 0.060724233983286906, + "grad_norm": 3.3898634910583496, + "learning_rate": 2.0173267326732672e-07, + "loss": 0.5419, + "mean_token_accuracy": 0.8302251100540161, + "num_tokens": 11873138.0, + "step": 327 + }, + { + "epoch": 0.06090993500464253, + "grad_norm": 3.074596643447876, + "learning_rate": 2.0235148514851486e-07, + "loss": 0.5655, + "mean_token_accuracy": 0.8240243792533875, + "num_tokens": 11911385.0, + "step": 328 + }, + { + "epoch": 0.06109563602599814, + "grad_norm": 3.3907175064086914, + "learning_rate": 2.0297029702970297e-07, + "loss": 0.6229, + "mean_token_accuracy": 0.8109442591667175, + "num_tokens": 11946016.0, + "step": 329 + }, + { + "epoch": 0.06128133704735376, + "grad_norm": 3.212580919265747, + "learning_rate": 2.0358910891089106e-07, + "loss": 0.5681, + "mean_token_accuracy": 0.820242166519165, + "num_tokens": 11982353.0, + "step": 330 + }, + { + "epoch": 0.06146703806870938, + "grad_norm": 3.5039734840393066, + "learning_rate": 2.042079207920792e-07, + "loss": 0.5792, + "mean_token_accuracy": 0.8236408829689026, + "num_tokens": 12021271.0, + "step": 331 + }, + { + "epoch": 0.061652739090064995, + "grad_norm": 3.0500648021698, + "learning_rate": 2.048267326732673e-07, + "loss": 0.5407, + "mean_token_accuracy": 0.8318171501159668, + "num_tokens": 12053903.0, + "step": 332 + }, + { + "epoch": 0.06183844011142061, + "grad_norm": 3.1186366081237793, + "learning_rate": 2.0544554455445542e-07, + "loss": 0.547, + "mean_token_accuracy": 0.8295854330062866, + "num_tokens": 12093642.0, + "step": 333 + }, + { + "epoch": 0.06202414113277623, + "grad_norm": 3.0169219970703125, + "learning_rate": 2.0606435643564356e-07, + "loss": 0.5808, + "mean_token_accuracy": 0.8199893832206726, + "num_tokens": 12131327.0, + "step": 334 + }, + { + "epoch": 0.06220984215413185, + "grad_norm": 3.144493579864502, + "learning_rate": 2.0668316831683167e-07, + "loss": 0.5988, + "mean_token_accuracy": 0.8126430511474609, + "num_tokens": 12164293.0, + "step": 335 + }, + { + "epoch": 0.06239554317548746, + "grad_norm": 3.4806666374206543, + "learning_rate": 2.0730198019801978e-07, + "loss": 0.5951, + "mean_token_accuracy": 0.8162698745727539, + "num_tokens": 12198646.0, + "step": 336 + }, + { + "epoch": 0.06258124419684308, + "grad_norm": 2.5653512477874756, + "learning_rate": 2.079207920792079e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8389807343482971, + "num_tokens": 12233743.0, + "step": 337 + }, + { + "epoch": 0.0627669452181987, + "grad_norm": 3.6432063579559326, + "learning_rate": 2.0853960396039603e-07, + "loss": 0.5809, + "mean_token_accuracy": 0.8152008056640625, + "num_tokens": 12262286.0, + "step": 338 + }, + { + "epoch": 0.06295264623955432, + "grad_norm": 2.9257214069366455, + "learning_rate": 2.0915841584158414e-07, + "loss": 0.5032, + "mean_token_accuracy": 0.8405559659004211, + "num_tokens": 12304791.0, + "step": 339 + }, + { + "epoch": 0.06313834726090993, + "grad_norm": 3.8558239936828613, + "learning_rate": 2.0977722772277225e-07, + "loss": 0.5808, + "mean_token_accuracy": 0.8165379166603088, + "num_tokens": 12337354.0, + "step": 340 + }, + { + "epoch": 0.06332404828226555, + "grad_norm": 3.686204671859741, + "learning_rate": 2.103960396039604e-07, + "loss": 0.6383, + "mean_token_accuracy": 0.8023837804794312, + "num_tokens": 12373437.0, + "step": 341 + }, + { + "epoch": 0.06350974930362117, + "grad_norm": 3.186763048171997, + "learning_rate": 2.110148514851485e-07, + "loss": 0.5376, + "mean_token_accuracy": 0.8318014144897461, + "num_tokens": 12409259.0, + "step": 342 + }, + { + "epoch": 0.06369545032497678, + "grad_norm": 2.9186148643493652, + "learning_rate": 2.116336633663366e-07, + "loss": 0.5243, + "mean_token_accuracy": 0.8364964723587036, + "num_tokens": 12447985.0, + "step": 343 + }, + { + "epoch": 0.0638811513463324, + "grad_norm": 2.7135848999023438, + "learning_rate": 2.1225247524752475e-07, + "loss": 0.5861, + "mean_token_accuracy": 0.8201305866241455, + "num_tokens": 12487142.0, + "step": 344 + }, + { + "epoch": 0.06406685236768803, + "grad_norm": 3.6002655029296875, + "learning_rate": 2.1287128712871286e-07, + "loss": 0.5512, + "mean_token_accuracy": 0.8240790367126465, + "num_tokens": 12518307.0, + "step": 345 + }, + { + "epoch": 0.06425255338904363, + "grad_norm": 3.544088125228882, + "learning_rate": 2.1349009900990097e-07, + "loss": 0.5138, + "mean_token_accuracy": 0.8366739153862, + "num_tokens": 12549539.0, + "step": 346 + }, + { + "epoch": 0.06443825441039926, + "grad_norm": 2.891883611679077, + "learning_rate": 2.1410891089108908e-07, + "loss": 0.6268, + "mean_token_accuracy": 0.8038026094436646, + "num_tokens": 12588109.0, + "step": 347 + }, + { + "epoch": 0.06462395543175488, + "grad_norm": 3.286332368850708, + "learning_rate": 2.1472772277227722e-07, + "loss": 0.5938, + "mean_token_accuracy": 0.8134980201721191, + "num_tokens": 12620474.0, + "step": 348 + }, + { + "epoch": 0.06480965645311049, + "grad_norm": 3.2350873947143555, + "learning_rate": 2.1534653465346533e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.8229050636291504, + "num_tokens": 12658013.0, + "step": 349 + }, + { + "epoch": 0.06499535747446611, + "grad_norm": 3.334758758544922, + "learning_rate": 2.1596534653465344e-07, + "loss": 0.5681, + "mean_token_accuracy": 0.8268067240715027, + "num_tokens": 12695299.0, + "step": 350 + }, + { + "epoch": 0.06518105849582173, + "grad_norm": 2.5212950706481934, + "learning_rate": 2.1658415841584158e-07, + "loss": 0.575, + "mean_token_accuracy": 0.8225003480911255, + "num_tokens": 12738106.0, + "step": 351 + }, + { + "epoch": 0.06536675951717734, + "grad_norm": 3.171415090560913, + "learning_rate": 2.172029702970297e-07, + "loss": 0.503, + "mean_token_accuracy": 0.839049220085144, + "num_tokens": 12775358.0, + "step": 352 + }, + { + "epoch": 0.06555246053853296, + "grad_norm": 2.960665225982666, + "learning_rate": 2.178217821782178e-07, + "loss": 0.5718, + "mean_token_accuracy": 0.8187280297279358, + "num_tokens": 12815485.0, + "step": 353 + }, + { + "epoch": 0.06573816155988858, + "grad_norm": 3.244230031967163, + "learning_rate": 2.1844059405940594e-07, + "loss": 0.5759, + "mean_token_accuracy": 0.8200933933258057, + "num_tokens": 12849608.0, + "step": 354 + }, + { + "epoch": 0.06592386258124419, + "grad_norm": 3.1931047439575195, + "learning_rate": 2.1905940594059405e-07, + "loss": 0.597, + "mean_token_accuracy": 0.8109370470046997, + "num_tokens": 12882031.0, + "step": 355 + }, + { + "epoch": 0.06610956360259981, + "grad_norm": 2.9190542697906494, + "learning_rate": 2.1967821782178216e-07, + "loss": 0.5738, + "mean_token_accuracy": 0.8216521739959717, + "num_tokens": 12919104.0, + "step": 356 + }, + { + "epoch": 0.06629526462395544, + "grad_norm": 3.130632162094116, + "learning_rate": 2.202970297029703e-07, + "loss": 0.5423, + "mean_token_accuracy": 0.8296087980270386, + "num_tokens": 12956254.0, + "step": 357 + }, + { + "epoch": 0.06648096564531104, + "grad_norm": 3.543919324874878, + "learning_rate": 2.209158415841584e-07, + "loss": 0.5831, + "mean_token_accuracy": 0.8181090354919434, + "num_tokens": 12991685.0, + "step": 358 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 2.61385178565979, + "learning_rate": 2.2153465346534652e-07, + "loss": 0.5393, + "mean_token_accuracy": 0.8323763608932495, + "num_tokens": 13031631.0, + "step": 359 + }, + { + "epoch": 0.06685236768802229, + "grad_norm": 2.7663660049438477, + "learning_rate": 2.2215346534653464e-07, + "loss": 0.5296, + "mean_token_accuracy": 0.8342483043670654, + "num_tokens": 13066605.0, + "step": 360 + }, + { + "epoch": 0.0670380687093779, + "grad_norm": 2.90507435798645, + "learning_rate": 2.2277227722772277e-07, + "loss": 0.5742, + "mean_token_accuracy": 0.8214617967605591, + "num_tokens": 13103737.0, + "step": 361 + }, + { + "epoch": 0.06722376973073352, + "grad_norm": 2.871649980545044, + "learning_rate": 2.2339108910891088e-07, + "loss": 0.5585, + "mean_token_accuracy": 0.8233360648155212, + "num_tokens": 13137841.0, + "step": 362 + }, + { + "epoch": 0.06740947075208914, + "grad_norm": 2.6913833618164062, + "learning_rate": 2.24009900990099e-07, + "loss": 0.5541, + "mean_token_accuracy": 0.823672354221344, + "num_tokens": 13173615.0, + "step": 363 + }, + { + "epoch": 0.06759517177344475, + "grad_norm": 2.516261339187622, + "learning_rate": 2.2462871287128713e-07, + "loss": 0.5774, + "mean_token_accuracy": 0.8179114460945129, + "num_tokens": 13208600.0, + "step": 364 + }, + { + "epoch": 0.06778087279480037, + "grad_norm": 3.0259952545166016, + "learning_rate": 2.2524752475247524e-07, + "loss": 0.5517, + "mean_token_accuracy": 0.8259312510490417, + "num_tokens": 13245353.0, + "step": 365 + }, + { + "epoch": 0.06796657381615599, + "grad_norm": 2.619779109954834, + "learning_rate": 2.2586633663366336e-07, + "loss": 0.5706, + "mean_token_accuracy": 0.8209314942359924, + "num_tokens": 13287078.0, + "step": 366 + }, + { + "epoch": 0.0681522748375116, + "grad_norm": 2.592639923095703, + "learning_rate": 2.264851485148515e-07, + "loss": 0.5284, + "mean_token_accuracy": 0.8322266340255737, + "num_tokens": 13321625.0, + "step": 367 + }, + { + "epoch": 0.06833797585886722, + "grad_norm": 3.000199317932129, + "learning_rate": 2.271039603960396e-07, + "loss": 0.5445, + "mean_token_accuracy": 0.8263395428657532, + "num_tokens": 13362657.0, + "step": 368 + }, + { + "epoch": 0.06852367688022284, + "grad_norm": 2.3832168579101562, + "learning_rate": 2.2772277227722772e-07, + "loss": 0.5957, + "mean_token_accuracy": 0.8114835619926453, + "num_tokens": 13403815.0, + "step": 369 + }, + { + "epoch": 0.06870937790157845, + "grad_norm": 2.7224268913269043, + "learning_rate": 2.2834158415841583e-07, + "loss": 0.5297, + "mean_token_accuracy": 0.8286790251731873, + "num_tokens": 13436556.0, + "step": 370 + }, + { + "epoch": 0.06889507892293407, + "grad_norm": 2.3732779026031494, + "learning_rate": 2.2896039603960397e-07, + "loss": 0.5533, + "mean_token_accuracy": 0.8259175419807434, + "num_tokens": 13477733.0, + "step": 371 + }, + { + "epoch": 0.0690807799442897, + "grad_norm": 2.435241222381592, + "learning_rate": 2.2957920792079208e-07, + "loss": 0.5599, + "mean_token_accuracy": 0.821792721748352, + "num_tokens": 13515455.0, + "step": 372 + }, + { + "epoch": 0.0692664809656453, + "grad_norm": 2.647731065750122, + "learning_rate": 2.301980198019802e-07, + "loss": 0.571, + "mean_token_accuracy": 0.8216710686683655, + "num_tokens": 13551151.0, + "step": 373 + }, + { + "epoch": 0.06945218198700093, + "grad_norm": 3.3863065242767334, + "learning_rate": 2.3081683168316833e-07, + "loss": 0.5263, + "mean_token_accuracy": 0.834233283996582, + "num_tokens": 13583258.0, + "step": 374 + }, + { + "epoch": 0.06963788300835655, + "grad_norm": 2.6982357501983643, + "learning_rate": 2.3143564356435644e-07, + "loss": 0.5213, + "mean_token_accuracy": 0.8357853889465332, + "num_tokens": 13616570.0, + "step": 375 + }, + { + "epoch": 0.06982358402971216, + "grad_norm": 2.9804744720458984, + "learning_rate": 2.3205445544554455e-07, + "loss": 0.5653, + "mean_token_accuracy": 0.8239635825157166, + "num_tokens": 13662475.0, + "step": 376 + }, + { + "epoch": 0.07000928505106778, + "grad_norm": 2.95879864692688, + "learning_rate": 2.3267326732673269e-07, + "loss": 0.5561, + "mean_token_accuracy": 0.8241099119186401, + "num_tokens": 13701748.0, + "step": 377 + }, + { + "epoch": 0.0701949860724234, + "grad_norm": 3.3251819610595703, + "learning_rate": 2.332920792079208e-07, + "loss": 0.5811, + "mean_token_accuracy": 0.8226674199104309, + "num_tokens": 13743342.0, + "step": 378 + }, + { + "epoch": 0.07038068709377901, + "grad_norm": 2.718764305114746, + "learning_rate": 2.339108910891089e-07, + "loss": 0.5752, + "mean_token_accuracy": 0.8222883939743042, + "num_tokens": 13777530.0, + "step": 379 + }, + { + "epoch": 0.07056638811513463, + "grad_norm": 2.9886293411254883, + "learning_rate": 2.34529702970297e-07, + "loss": 0.5292, + "mean_token_accuracy": 0.8336893320083618, + "num_tokens": 13814294.0, + "step": 380 + }, + { + "epoch": 0.07075208913649025, + "grad_norm": 2.5827550888061523, + "learning_rate": 2.3514851485148513e-07, + "loss": 0.6031, + "mean_token_accuracy": 0.8145753741264343, + "num_tokens": 13851331.0, + "step": 381 + }, + { + "epoch": 0.07093779015784586, + "grad_norm": 2.7075867652893066, + "learning_rate": 2.3576732673267324e-07, + "loss": 0.5318, + "mean_token_accuracy": 0.8294758200645447, + "num_tokens": 13887954.0, + "step": 382 + }, + { + "epoch": 0.07112349117920148, + "grad_norm": 2.4939475059509277, + "learning_rate": 2.3638613861386135e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8298614025115967, + "num_tokens": 13922290.0, + "step": 383 + }, + { + "epoch": 0.0713091922005571, + "grad_norm": 2.603877067565918, + "learning_rate": 2.370049504950495e-07, + "loss": 0.5612, + "mean_token_accuracy": 0.819438099861145, + "num_tokens": 13955238.0, + "step": 384 + }, + { + "epoch": 0.07149489322191271, + "grad_norm": 2.5779106616973877, + "learning_rate": 2.376237623762376e-07, + "loss": 0.5827, + "mean_token_accuracy": 0.8196274638175964, + "num_tokens": 13988575.0, + "step": 385 + }, + { + "epoch": 0.07168059424326834, + "grad_norm": 2.231868028640747, + "learning_rate": 2.3824257425742571e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.8257609009742737, + "num_tokens": 14026191.0, + "step": 386 + }, + { + "epoch": 0.07186629526462396, + "grad_norm": 2.7343788146972656, + "learning_rate": 2.388613861386138e-07, + "loss": 0.5274, + "mean_token_accuracy": 0.8313305974006653, + "num_tokens": 14061172.0, + "step": 387 + }, + { + "epoch": 0.07205199628597957, + "grad_norm": 2.478774070739746, + "learning_rate": 2.3948019801980194e-07, + "loss": 0.6356, + "mean_token_accuracy": 0.8038530349731445, + "num_tokens": 14096406.0, + "step": 388 + }, + { + "epoch": 0.07223769730733519, + "grad_norm": 2.5295732021331787, + "learning_rate": 2.400990099009901e-07, + "loss": 0.5341, + "mean_token_accuracy": 0.8274471163749695, + "num_tokens": 14133990.0, + "step": 389 + }, + { + "epoch": 0.07242339832869081, + "grad_norm": 2.557630777359009, + "learning_rate": 2.407178217821782e-07, + "loss": 0.5032, + "mean_token_accuracy": 0.8428009748458862, + "num_tokens": 14169075.0, + "step": 390 + }, + { + "epoch": 0.07260909935004642, + "grad_norm": 2.2986958026885986, + "learning_rate": 2.413366336633663e-07, + "loss": 0.5672, + "mean_token_accuracy": 0.8230427503585815, + "num_tokens": 14207091.0, + "step": 391 + }, + { + "epoch": 0.07279480037140204, + "grad_norm": 2.4929146766662598, + "learning_rate": 2.4195544554455444e-07, + "loss": 0.543, + "mean_token_accuracy": 0.8267040252685547, + "num_tokens": 14245726.0, + "step": 392 + }, + { + "epoch": 0.07298050139275766, + "grad_norm": 2.447085380554199, + "learning_rate": 2.4257425742574255e-07, + "loss": 0.5506, + "mean_token_accuracy": 0.8272372484207153, + "num_tokens": 14283210.0, + "step": 393 + }, + { + "epoch": 0.07316620241411327, + "grad_norm": 2.3781516551971436, + "learning_rate": 2.4319306930693066e-07, + "loss": 0.5643, + "mean_token_accuracy": 0.826063871383667, + "num_tokens": 14323936.0, + "step": 394 + }, + { + "epoch": 0.0733519034354689, + "grad_norm": 2.187131881713867, + "learning_rate": 2.4381188118811877e-07, + "loss": 0.5619, + "mean_token_accuracy": 0.8234604597091675, + "num_tokens": 14364090.0, + "step": 395 + }, + { + "epoch": 0.07353760445682452, + "grad_norm": 2.487359046936035, + "learning_rate": 2.4443069306930693e-07, + "loss": 0.5437, + "mean_token_accuracy": 0.8274933099746704, + "num_tokens": 14396257.0, + "step": 396 + }, + { + "epoch": 0.07372330547818012, + "grad_norm": 2.352921962738037, + "learning_rate": 2.4504950495049505e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.8220890760421753, + "num_tokens": 14438961.0, + "step": 397 + }, + { + "epoch": 0.07390900649953575, + "grad_norm": 2.246908664703369, + "learning_rate": 2.4566831683168316e-07, + "loss": 0.5496, + "mean_token_accuracy": 0.8310750126838684, + "num_tokens": 14479038.0, + "step": 398 + }, + { + "epoch": 0.07409470752089137, + "grad_norm": 2.548473596572876, + "learning_rate": 2.4628712871287127e-07, + "loss": 0.5804, + "mean_token_accuracy": 0.8151435852050781, + "num_tokens": 14515049.0, + "step": 399 + }, + { + "epoch": 0.07428040854224698, + "grad_norm": 3.1316590309143066, + "learning_rate": 2.469059405940594e-07, + "loss": 0.597, + "mean_token_accuracy": 0.8112139701843262, + "num_tokens": 14549231.0, + "step": 400 + }, + { + "epoch": 0.0744661095636026, + "grad_norm": 2.22037672996521, + "learning_rate": 2.475247524752475e-07, + "loss": 0.5744, + "mean_token_accuracy": 0.8207686543464661, + "num_tokens": 14587623.0, + "step": 401 + }, + { + "epoch": 0.07465181058495822, + "grad_norm": 3.0994954109191895, + "learning_rate": 2.4814356435643565e-07, + "loss": 0.5472, + "mean_token_accuracy": 0.8266162872314453, + "num_tokens": 14614746.0, + "step": 402 + }, + { + "epoch": 0.07483751160631383, + "grad_norm": 3.125133752822876, + "learning_rate": 2.4876237623762377e-07, + "loss": 0.534, + "mean_token_accuracy": 0.8287583589553833, + "num_tokens": 14647388.0, + "step": 403 + }, + { + "epoch": 0.07502321262766945, + "grad_norm": 2.605391263961792, + "learning_rate": 2.493811881188119e-07, + "loss": 0.5841, + "mean_token_accuracy": 0.8192434310913086, + "num_tokens": 14683827.0, + "step": 404 + }, + { + "epoch": 0.07520891364902507, + "grad_norm": 2.225966215133667, + "learning_rate": 2.5e-07, + "loss": 0.5192, + "mean_token_accuracy": 0.8323058485984802, + "num_tokens": 14722488.0, + "step": 405 + }, + { + "epoch": 0.07539461467038068, + "grad_norm": 2.280167579650879, + "learning_rate": 2.506188118811881e-07, + "loss": 0.5287, + "mean_token_accuracy": 0.8328673243522644, + "num_tokens": 14760394.0, + "step": 406 + }, + { + "epoch": 0.0755803156917363, + "grad_norm": 2.2630608081817627, + "learning_rate": 2.512376237623762e-07, + "loss": 0.5505, + "mean_token_accuracy": 0.8238289952278137, + "num_tokens": 14800049.0, + "step": 407 + }, + { + "epoch": 0.07576601671309192, + "grad_norm": 2.6195859909057617, + "learning_rate": 2.518564356435643e-07, + "loss": 0.5479, + "mean_token_accuracy": 0.8273704648017883, + "num_tokens": 14835371.0, + "step": 408 + }, + { + "epoch": 0.07595171773444755, + "grad_norm": 2.54524564743042, + "learning_rate": 2.524752475247525e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.8357520699501038, + "num_tokens": 14873280.0, + "step": 409 + }, + { + "epoch": 0.07613741875580315, + "grad_norm": 2.8508048057556152, + "learning_rate": 2.530940594059406e-07, + "loss": 0.5353, + "mean_token_accuracy": 0.8291577100753784, + "num_tokens": 14901406.0, + "step": 410 + }, + { + "epoch": 0.07632311977715878, + "grad_norm": 2.2482314109802246, + "learning_rate": 2.537128712871287e-07, + "loss": 0.5151, + "mean_token_accuracy": 0.8342801332473755, + "num_tokens": 14939734.0, + "step": 411 + }, + { + "epoch": 0.0765088207985144, + "grad_norm": 2.0405890941619873, + "learning_rate": 2.543316831683168e-07, + "loss": 0.5257, + "mean_token_accuracy": 0.835532546043396, + "num_tokens": 14978628.0, + "step": 412 + }, + { + "epoch": 0.07669452181987, + "grad_norm": 2.084982395172119, + "learning_rate": 2.5495049504950493e-07, + "loss": 0.5493, + "mean_token_accuracy": 0.8251481056213379, + "num_tokens": 15022323.0, + "step": 413 + }, + { + "epoch": 0.07688022284122563, + "grad_norm": 2.366521120071411, + "learning_rate": 2.5556930693069304e-07, + "loss": 0.5306, + "mean_token_accuracy": 0.8301851153373718, + "num_tokens": 15055714.0, + "step": 414 + }, + { + "epoch": 0.07706592386258125, + "grad_norm": 2.116192579269409, + "learning_rate": 2.561881188118812e-07, + "loss": 0.5444, + "mean_token_accuracy": 0.8269561529159546, + "num_tokens": 15092921.0, + "step": 415 + }, + { + "epoch": 0.07725162488393686, + "grad_norm": 2.422081232070923, + "learning_rate": 2.568069306930693e-07, + "loss": 0.6289, + "mean_token_accuracy": 0.8054243326187134, + "num_tokens": 15128249.0, + "step": 416 + }, + { + "epoch": 0.07743732590529248, + "grad_norm": 2.1946442127227783, + "learning_rate": 2.5742574257425743e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8478230237960815, + "num_tokens": 15164598.0, + "step": 417 + }, + { + "epoch": 0.0776230269266481, + "grad_norm": 2.131683111190796, + "learning_rate": 2.5804455445544554e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.8484119176864624, + "num_tokens": 15198241.0, + "step": 418 + }, + { + "epoch": 0.07780872794800371, + "grad_norm": 2.0876691341400146, + "learning_rate": 2.5866336633663365e-07, + "loss": 0.5058, + "mean_token_accuracy": 0.8421915173530579, + "num_tokens": 15236008.0, + "step": 419 + }, + { + "epoch": 0.07799442896935933, + "grad_norm": 2.4048874378204346, + "learning_rate": 2.5928217821782176e-07, + "loss": 0.5517, + "mean_token_accuracy": 0.8252980709075928, + "num_tokens": 15273056.0, + "step": 420 + }, + { + "epoch": 0.07818012999071496, + "grad_norm": 2.3657281398773193, + "learning_rate": 2.599009900990099e-07, + "loss": 0.5994, + "mean_token_accuracy": 0.811217725276947, + "num_tokens": 15310510.0, + "step": 421 + }, + { + "epoch": 0.07836583101207056, + "grad_norm": 2.3261613845825195, + "learning_rate": 2.6051980198019804e-07, + "loss": 0.5775, + "mean_token_accuracy": 0.8164739012718201, + "num_tokens": 15346572.0, + "step": 422 + }, + { + "epoch": 0.07855153203342619, + "grad_norm": 2.0465705394744873, + "learning_rate": 2.6113861386138615e-07, + "loss": 0.5407, + "mean_token_accuracy": 0.8303365111351013, + "num_tokens": 15383346.0, + "step": 423 + }, + { + "epoch": 0.07873723305478181, + "grad_norm": 2.2450039386749268, + "learning_rate": 2.6175742574257426e-07, + "loss": 0.5391, + "mean_token_accuracy": 0.8256102204322815, + "num_tokens": 15417861.0, + "step": 424 + }, + { + "epoch": 0.07892293407613742, + "grad_norm": 2.2196004390716553, + "learning_rate": 2.623762376237624e-07, + "loss": 0.4999, + "mean_token_accuracy": 0.8377252817153931, + "num_tokens": 15460202.0, + "step": 425 + }, + { + "epoch": 0.07910863509749304, + "grad_norm": 1.8675222396850586, + "learning_rate": 2.629950495049505e-07, + "loss": 0.5672, + "mean_token_accuracy": 0.8185715675354004, + "num_tokens": 15502498.0, + "step": 426 + }, + { + "epoch": 0.07929433611884866, + "grad_norm": 2.0050816535949707, + "learning_rate": 2.636138613861386e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8431457281112671, + "num_tokens": 15537329.0, + "step": 427 + }, + { + "epoch": 0.07948003714020427, + "grad_norm": 1.860457420349121, + "learning_rate": 2.642326732673267e-07, + "loss": 0.5354, + "mean_token_accuracy": 0.8277373313903809, + "num_tokens": 15579572.0, + "step": 428 + }, + { + "epoch": 0.07966573816155989, + "grad_norm": 2.211702346801758, + "learning_rate": 2.6485148514851487e-07, + "loss": 0.5049, + "mean_token_accuracy": 0.8386062979698181, + "num_tokens": 15614030.0, + "step": 429 + }, + { + "epoch": 0.07985143918291551, + "grad_norm": 2.0603630542755127, + "learning_rate": 2.65470297029703e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.836135983467102, + "num_tokens": 15656895.0, + "step": 430 + }, + { + "epoch": 0.08003714020427112, + "grad_norm": 2.018232583999634, + "learning_rate": 2.660891089108911e-07, + "loss": 0.5622, + "mean_token_accuracy": 0.8228162527084351, + "num_tokens": 15697402.0, + "step": 431 + }, + { + "epoch": 0.08022284122562674, + "grad_norm": 2.160501480102539, + "learning_rate": 2.667079207920792e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.8378465175628662, + "num_tokens": 15730573.0, + "step": 432 + }, + { + "epoch": 0.08040854224698236, + "grad_norm": 2.1136674880981445, + "learning_rate": 2.673267326732673e-07, + "loss": 0.5264, + "mean_token_accuracy": 0.8309373259544373, + "num_tokens": 15766987.0, + "step": 433 + }, + { + "epoch": 0.08059424326833797, + "grad_norm": 1.9611692428588867, + "learning_rate": 2.6794554455445543e-07, + "loss": 0.5112, + "mean_token_accuracy": 0.8348652124404907, + "num_tokens": 15805159.0, + "step": 434 + }, + { + "epoch": 0.0807799442896936, + "grad_norm": 2.0820300579071045, + "learning_rate": 2.685643564356436e-07, + "loss": 0.5207, + "mean_token_accuracy": 0.8371596336364746, + "num_tokens": 15841070.0, + "step": 435 + }, + { + "epoch": 0.08096564531104922, + "grad_norm": 2.181576728820801, + "learning_rate": 2.691831683168317e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.8412570357322693, + "num_tokens": 15876407.0, + "step": 436 + }, + { + "epoch": 0.08115134633240483, + "grad_norm": 1.9145612716674805, + "learning_rate": 2.698019801980198e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8501585125923157, + "num_tokens": 15916484.0, + "step": 437 + }, + { + "epoch": 0.08133704735376045, + "grad_norm": 1.919403314590454, + "learning_rate": 2.7042079207920793e-07, + "loss": 0.5685, + "mean_token_accuracy": 0.8203394412994385, + "num_tokens": 15956780.0, + "step": 438 + }, + { + "epoch": 0.08152274837511607, + "grad_norm": 2.0434701442718506, + "learning_rate": 2.7103960396039604e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8392624855041504, + "num_tokens": 15990577.0, + "step": 439 + }, + { + "epoch": 0.08170844939647168, + "grad_norm": 2.3838412761688232, + "learning_rate": 2.7165841584158415e-07, + "loss": 0.5647, + "mean_token_accuracy": 0.8236123323440552, + "num_tokens": 16027994.0, + "step": 440 + }, + { + "epoch": 0.0818941504178273, + "grad_norm": 2.357264995574951, + "learning_rate": 2.7227722772277226e-07, + "loss": 0.5265, + "mean_token_accuracy": 0.8261154890060425, + "num_tokens": 16061737.0, + "step": 441 + }, + { + "epoch": 0.08207985143918292, + "grad_norm": 2.389533042907715, + "learning_rate": 2.728960396039604e-07, + "loss": 0.5608, + "mean_token_accuracy": 0.8184967041015625, + "num_tokens": 16095082.0, + "step": 442 + }, + { + "epoch": 0.08226555246053853, + "grad_norm": 2.0769810676574707, + "learning_rate": 2.7351485148514854e-07, + "loss": 0.5635, + "mean_token_accuracy": 0.8190822601318359, + "num_tokens": 16128996.0, + "step": 443 + }, + { + "epoch": 0.08245125348189415, + "grad_norm": 1.764892339706421, + "learning_rate": 2.7413366336633665e-07, + "loss": 0.4511, + "mean_token_accuracy": 0.8531970977783203, + "num_tokens": 16170676.0, + "step": 444 + }, + { + "epoch": 0.08263695450324977, + "grad_norm": 2.1498613357543945, + "learning_rate": 2.7475247524752476e-07, + "loss": 0.5588, + "mean_token_accuracy": 0.8250960111618042, + "num_tokens": 16209650.0, + "step": 445 + }, + { + "epoch": 0.08282265552460538, + "grad_norm": 2.086606740951538, + "learning_rate": 2.7537128712871287e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8431580066680908, + "num_tokens": 16242037.0, + "step": 446 + }, + { + "epoch": 0.083008356545961, + "grad_norm": 1.7028614282608032, + "learning_rate": 2.75990099009901e-07, + "loss": 0.5146, + "mean_token_accuracy": 0.8336703777313232, + "num_tokens": 16284786.0, + "step": 447 + }, + { + "epoch": 0.08319405756731663, + "grad_norm": 1.8615938425064087, + "learning_rate": 2.7660891089108915e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.8396470546722412, + "num_tokens": 16320856.0, + "step": 448 + }, + { + "epoch": 0.08337975858867223, + "grad_norm": 2.2526845932006836, + "learning_rate": 2.7722772277227726e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.8432881832122803, + "num_tokens": 16353483.0, + "step": 449 + }, + { + "epoch": 0.08356545961002786, + "grad_norm": 2.089879035949707, + "learning_rate": 2.7784653465346537e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.839098334312439, + "num_tokens": 16388929.0, + "step": 450 + }, + { + "epoch": 0.08375116063138348, + "grad_norm": 1.8683627843856812, + "learning_rate": 2.784653465346535e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.834660530090332, + "num_tokens": 16425217.0, + "step": 451 + }, + { + "epoch": 0.08393686165273909, + "grad_norm": 1.9380472898483276, + "learning_rate": 2.790841584158416e-07, + "loss": 0.5578, + "mean_token_accuracy": 0.8215131759643555, + "num_tokens": 16465567.0, + "step": 452 + }, + { + "epoch": 0.08412256267409471, + "grad_norm": 1.9354809522628784, + "learning_rate": 2.797029702970297e-07, + "loss": 0.5812, + "mean_token_accuracy": 0.8197886943817139, + "num_tokens": 16508083.0, + "step": 453 + }, + { + "epoch": 0.08430826369545033, + "grad_norm": 2.0973753929138184, + "learning_rate": 2.803217821782178e-07, + "loss": 0.5531, + "mean_token_accuracy": 0.8255127668380737, + "num_tokens": 16541582.0, + "step": 454 + }, + { + "epoch": 0.08449396471680594, + "grad_norm": 2.143751859664917, + "learning_rate": 2.80940594059406e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8388532400131226, + "num_tokens": 16582979.0, + "step": 455 + }, + { + "epoch": 0.08467966573816156, + "grad_norm": 1.8402947187423706, + "learning_rate": 2.8155940594059404e-07, + "loss": 0.5639, + "mean_token_accuracy": 0.8208935856819153, + "num_tokens": 16622641.0, + "step": 456 + }, + { + "epoch": 0.08486536675951718, + "grad_norm": 1.8124537467956543, + "learning_rate": 2.8217821782178215e-07, + "loss": 0.568, + "mean_token_accuracy": 0.818731963634491, + "num_tokens": 16668342.0, + "step": 457 + }, + { + "epoch": 0.08505106778087279, + "grad_norm": 1.876834511756897, + "learning_rate": 2.8279702970297026e-07, + "loss": 0.5302, + "mean_token_accuracy": 0.8294557332992554, + "num_tokens": 16706954.0, + "step": 458 + }, + { + "epoch": 0.08523676880222841, + "grad_norm": 1.8379006385803223, + "learning_rate": 2.8341584158415837e-07, + "loss": 0.5244, + "mean_token_accuracy": 0.827756941318512, + "num_tokens": 16742492.0, + "step": 459 + }, + { + "epoch": 0.08542246982358404, + "grad_norm": 1.9275168180465698, + "learning_rate": 2.840346534653465e-07, + "loss": 0.5921, + "mean_token_accuracy": 0.8090651035308838, + "num_tokens": 16782710.0, + "step": 460 + }, + { + "epoch": 0.08560817084493964, + "grad_norm": 1.809140920639038, + "learning_rate": 2.846534653465346e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8454336524009705, + "num_tokens": 16821741.0, + "step": 461 + }, + { + "epoch": 0.08579387186629527, + "grad_norm": 1.883542537689209, + "learning_rate": 2.8527227722772276e-07, + "loss": 0.5565, + "mean_token_accuracy": 0.8236032128334045, + "num_tokens": 16863174.0, + "step": 462 + }, + { + "epoch": 0.08597957288765089, + "grad_norm": 1.8221440315246582, + "learning_rate": 2.8589108910891087e-07, + "loss": 0.5307, + "mean_token_accuracy": 0.8297117948532104, + "num_tokens": 16901288.0, + "step": 463 + }, + { + "epoch": 0.0861652739090065, + "grad_norm": 1.8531345129013062, + "learning_rate": 2.86509900990099e-07, + "loss": 0.503, + "mean_token_accuracy": 0.8403381109237671, + "num_tokens": 16942211.0, + "step": 464 + }, + { + "epoch": 0.08635097493036212, + "grad_norm": 1.9858134984970093, + "learning_rate": 2.871287128712871e-07, + "loss": 0.5526, + "mean_token_accuracy": 0.8252741098403931, + "num_tokens": 16982229.0, + "step": 465 + }, + { + "epoch": 0.08653667595171774, + "grad_norm": 2.2557716369628906, + "learning_rate": 2.877475247524752e-07, + "loss": 0.5023, + "mean_token_accuracy": 0.836287260055542, + "num_tokens": 17018089.0, + "step": 466 + }, + { + "epoch": 0.08672237697307335, + "grad_norm": 1.9320762157440186, + "learning_rate": 2.883663366336633e-07, + "loss": 0.5179, + "mean_token_accuracy": 0.8361649513244629, + "num_tokens": 17057151.0, + "step": 467 + }, + { + "epoch": 0.08690807799442897, + "grad_norm": 2.053105115890503, + "learning_rate": 2.889851485148514e-07, + "loss": 0.6007, + "mean_token_accuracy": 0.8168664574623108, + "num_tokens": 17093213.0, + "step": 468 + }, + { + "epoch": 0.08709377901578459, + "grad_norm": 1.8678666353225708, + "learning_rate": 2.896039603960396e-07, + "loss": 0.4538, + "mean_token_accuracy": 0.8511762619018555, + "num_tokens": 17130416.0, + "step": 469 + }, + { + "epoch": 0.0872794800371402, + "grad_norm": 2.134854316711426, + "learning_rate": 2.902227722772277e-07, + "loss": 0.5804, + "mean_token_accuracy": 0.8189681768417358, + "num_tokens": 17167018.0, + "step": 470 + }, + { + "epoch": 0.08746518105849582, + "grad_norm": 2.2785377502441406, + "learning_rate": 2.908415841584158e-07, + "loss": 0.502, + "mean_token_accuracy": 0.8386746644973755, + "num_tokens": 17198580.0, + "step": 471 + }, + { + "epoch": 0.08765088207985144, + "grad_norm": 1.8074867725372314, + "learning_rate": 2.914603960396039e-07, + "loss": 0.4919, + "mean_token_accuracy": 0.8411403894424438, + "num_tokens": 17237199.0, + "step": 472 + }, + { + "epoch": 0.08783658310120705, + "grad_norm": 2.0160980224609375, + "learning_rate": 2.9207920792079203e-07, + "loss": 0.5408, + "mean_token_accuracy": 0.8268643617630005, + "num_tokens": 17274161.0, + "step": 473 + }, + { + "epoch": 0.08802228412256267, + "grad_norm": 2.19974946975708, + "learning_rate": 2.9269801980198015e-07, + "loss": 0.5575, + "mean_token_accuracy": 0.8185084462165833, + "num_tokens": 17305664.0, + "step": 474 + }, + { + "epoch": 0.0882079851439183, + "grad_norm": 2.0908713340759277, + "learning_rate": 2.933168316831683e-07, + "loss": 0.5229, + "mean_token_accuracy": 0.8307822942733765, + "num_tokens": 17340797.0, + "step": 475 + }, + { + "epoch": 0.0883936861652739, + "grad_norm": 1.8730530738830566, + "learning_rate": 2.939356435643564e-07, + "loss": 0.5097, + "mean_token_accuracy": 0.8374785780906677, + "num_tokens": 17379347.0, + "step": 476 + }, + { + "epoch": 0.08857938718662953, + "grad_norm": 1.95038902759552, + "learning_rate": 2.9455445544554453e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8376567363739014, + "num_tokens": 17415589.0, + "step": 477 + }, + { + "epoch": 0.08876508820798515, + "grad_norm": 1.8845939636230469, + "learning_rate": 2.9517326732673264e-07, + "loss": 0.4358, + "mean_token_accuracy": 0.8549529910087585, + "num_tokens": 17450423.0, + "step": 478 + }, + { + "epoch": 0.08895078922934076, + "grad_norm": 1.8569331169128418, + "learning_rate": 2.9579207920792076e-07, + "loss": 0.51, + "mean_token_accuracy": 0.8369952440261841, + "num_tokens": 17486293.0, + "step": 479 + }, + { + "epoch": 0.08913649025069638, + "grad_norm": 1.789046049118042, + "learning_rate": 2.9641089108910887e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.8358830213546753, + "num_tokens": 17527453.0, + "step": 480 + }, + { + "epoch": 0.089322191272052, + "grad_norm": 1.7368075847625732, + "learning_rate": 2.97029702970297e-07, + "loss": 0.5221, + "mean_token_accuracy": 0.8340874314308167, + "num_tokens": 17567060.0, + "step": 481 + }, + { + "epoch": 0.08950789229340761, + "grad_norm": 1.935848355293274, + "learning_rate": 2.9764851485148514e-07, + "loss": 0.5634, + "mean_token_accuracy": 0.8218333125114441, + "num_tokens": 17605194.0, + "step": 482 + }, + { + "epoch": 0.08969359331476323, + "grad_norm": 1.9330434799194336, + "learning_rate": 2.9826732673267325e-07, + "loss": 0.5207, + "mean_token_accuracy": 0.8388301134109497, + "num_tokens": 17642352.0, + "step": 483 + }, + { + "epoch": 0.08987929433611885, + "grad_norm": 1.9701725244522095, + "learning_rate": 2.9888613861386136e-07, + "loss": 0.5687, + "mean_token_accuracy": 0.8168267011642456, + "num_tokens": 17676100.0, + "step": 484 + }, + { + "epoch": 0.09006499535747446, + "grad_norm": 1.922878384590149, + "learning_rate": 2.995049504950495e-07, + "loss": 0.5336, + "mean_token_accuracy": 0.8293920755386353, + "num_tokens": 17711332.0, + "step": 485 + }, + { + "epoch": 0.09025069637883008, + "grad_norm": 1.7991886138916016, + "learning_rate": 3.001237623762376e-07, + "loss": 0.4785, + "mean_token_accuracy": 0.8454499840736389, + "num_tokens": 17749093.0, + "step": 486 + }, + { + "epoch": 0.0904363974001857, + "grad_norm": 1.7885278463363647, + "learning_rate": 3.007425742574257e-07, + "loss": 0.601, + "mean_token_accuracy": 0.8155298233032227, + "num_tokens": 17792173.0, + "step": 487 + }, + { + "epoch": 0.09062209842154131, + "grad_norm": 2.046306610107422, + "learning_rate": 3.0136138613861386e-07, + "loss": 0.5241, + "mean_token_accuracy": 0.831124484539032, + "num_tokens": 17826837.0, + "step": 488 + }, + { + "epoch": 0.09080779944289694, + "grad_norm": 1.8587112426757812, + "learning_rate": 3.01980198019802e-07, + "loss": 0.5215, + "mean_token_accuracy": 0.8340601325035095, + "num_tokens": 17866659.0, + "step": 489 + }, + { + "epoch": 0.09099350046425256, + "grad_norm": 1.731336236000061, + "learning_rate": 3.025990099009901e-07, + "loss": 0.4827, + "mean_token_accuracy": 0.8410942554473877, + "num_tokens": 17903287.0, + "step": 490 + }, + { + "epoch": 0.09117920148560817, + "grad_norm": 1.950868844985962, + "learning_rate": 3.032178217821782e-07, + "loss": 0.5477, + "mean_token_accuracy": 0.8273443579673767, + "num_tokens": 17942573.0, + "step": 491 + }, + { + "epoch": 0.09136490250696379, + "grad_norm": 1.8536248207092285, + "learning_rate": 3.038366336633663e-07, + "loss": 0.5125, + "mean_token_accuracy": 0.8380119800567627, + "num_tokens": 17982621.0, + "step": 492 + }, + { + "epoch": 0.09155060352831941, + "grad_norm": 1.8554861545562744, + "learning_rate": 3.044554455445544e-07, + "loss": 0.4789, + "mean_token_accuracy": 0.8439081907272339, + "num_tokens": 18019300.0, + "step": 493 + }, + { + "epoch": 0.09173630454967502, + "grad_norm": 1.9067201614379883, + "learning_rate": 3.0507425742574253e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.8359537124633789, + "num_tokens": 18057355.0, + "step": 494 + }, + { + "epoch": 0.09192200557103064, + "grad_norm": 1.795691967010498, + "learning_rate": 3.056930693069307e-07, + "loss": 0.5489, + "mean_token_accuracy": 0.8248369097709656, + "num_tokens": 18096070.0, + "step": 495 + }, + { + "epoch": 0.09210770659238626, + "grad_norm": 1.7555733919143677, + "learning_rate": 3.063118811881188e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8414013385772705, + "num_tokens": 18132258.0, + "step": 496 + }, + { + "epoch": 0.09229340761374187, + "grad_norm": 1.8302967548370361, + "learning_rate": 3.069306930693069e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.835955798625946, + "num_tokens": 18171833.0, + "step": 497 + }, + { + "epoch": 0.09247910863509749, + "grad_norm": 1.7305105924606323, + "learning_rate": 3.0754950495049503e-07, + "loss": 0.5053, + "mean_token_accuracy": 0.8411322832107544, + "num_tokens": 18212099.0, + "step": 498 + }, + { + "epoch": 0.09266480965645311, + "grad_norm": 1.8460886478424072, + "learning_rate": 3.0816831683168314e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.8397106528282166, + "num_tokens": 18246095.0, + "step": 499 + }, + { + "epoch": 0.09285051067780872, + "grad_norm": 1.8441013097763062, + "learning_rate": 3.0878712871287125e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.8358921408653259, + "num_tokens": 18283001.0, + "step": 500 + }, + { + "epoch": 0.09303621169916435, + "grad_norm": 1.7781964540481567, + "learning_rate": 3.0940594059405936e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.8441472053527832, + "num_tokens": 18322350.0, + "step": 501 + }, + { + "epoch": 0.09322191272051997, + "grad_norm": 1.8657293319702148, + "learning_rate": 3.1002475247524753e-07, + "loss": 0.5023, + "mean_token_accuracy": 0.8353167772293091, + "num_tokens": 18354721.0, + "step": 502 + }, + { + "epoch": 0.09340761374187558, + "grad_norm": 1.846566915512085, + "learning_rate": 3.1064356435643564e-07, + "loss": 0.5236, + "mean_token_accuracy": 0.832587718963623, + "num_tokens": 18396355.0, + "step": 503 + }, + { + "epoch": 0.0935933147632312, + "grad_norm": 1.666512131690979, + "learning_rate": 3.1126237623762375e-07, + "loss": 0.5345, + "mean_token_accuracy": 0.8297494053840637, + "num_tokens": 18439832.0, + "step": 504 + }, + { + "epoch": 0.09377901578458682, + "grad_norm": 1.784022569656372, + "learning_rate": 3.1188118811881186e-07, + "loss": 0.5406, + "mean_token_accuracy": 0.8326860070228577, + "num_tokens": 18476774.0, + "step": 505 + }, + { + "epoch": 0.09396471680594243, + "grad_norm": 1.784611463546753, + "learning_rate": 3.1249999999999997e-07, + "loss": 0.5402, + "mean_token_accuracy": 0.827741265296936, + "num_tokens": 18512763.0, + "step": 506 + }, + { + "epoch": 0.09415041782729805, + "grad_norm": 1.9944350719451904, + "learning_rate": 3.131188118811881e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.8452810049057007, + "num_tokens": 18543627.0, + "step": 507 + }, + { + "epoch": 0.09433611884865367, + "grad_norm": 1.7632540464401245, + "learning_rate": 3.1373762376237625e-07, + "loss": 0.4955, + "mean_token_accuracy": 0.8378505110740662, + "num_tokens": 18582766.0, + "step": 508 + }, + { + "epoch": 0.09452181987000928, + "grad_norm": 1.9711782932281494, + "learning_rate": 3.1435643564356436e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8378986716270447, + "num_tokens": 18615213.0, + "step": 509 + }, + { + "epoch": 0.0947075208913649, + "grad_norm": 1.8236496448516846, + "learning_rate": 3.1497524752475247e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.8438131809234619, + "num_tokens": 18654448.0, + "step": 510 + }, + { + "epoch": 0.09489322191272052, + "grad_norm": 1.6912481784820557, + "learning_rate": 3.155940594059406e-07, + "loss": 0.538, + "mean_token_accuracy": 0.8324435949325562, + "num_tokens": 18696781.0, + "step": 511 + }, + { + "epoch": 0.09507892293407613, + "grad_norm": 1.9717155694961548, + "learning_rate": 3.162128712871287e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.842108964920044, + "num_tokens": 18727677.0, + "step": 512 + }, + { + "epoch": 0.09526462395543175, + "grad_norm": 1.8300244808197021, + "learning_rate": 3.168316831683168e-07, + "loss": 0.5537, + "mean_token_accuracy": 0.8213398456573486, + "num_tokens": 18764176.0, + "step": 513 + }, + { + "epoch": 0.09545032497678738, + "grad_norm": 1.728833556175232, + "learning_rate": 3.174504950495049e-07, + "loss": 0.5407, + "mean_token_accuracy": 0.825141966342926, + "num_tokens": 18805680.0, + "step": 514 + }, + { + "epoch": 0.09563602599814298, + "grad_norm": 2.2243576049804688, + "learning_rate": 3.180693069306931e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.8389987945556641, + "num_tokens": 18835073.0, + "step": 515 + }, + { + "epoch": 0.0958217270194986, + "grad_norm": 1.8716427087783813, + "learning_rate": 3.186881188118812e-07, + "loss": 0.508, + "mean_token_accuracy": 0.835080623626709, + "num_tokens": 18867709.0, + "step": 516 + }, + { + "epoch": 0.09600742804085423, + "grad_norm": 2.0360798835754395, + "learning_rate": 3.193069306930693e-07, + "loss": 0.5115, + "mean_token_accuracy": 0.8350691795349121, + "num_tokens": 18900262.0, + "step": 517 + }, + { + "epoch": 0.09619312906220984, + "grad_norm": 1.9676443338394165, + "learning_rate": 3.199257425742574e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8363354206085205, + "num_tokens": 18928250.0, + "step": 518 + }, + { + "epoch": 0.09637883008356546, + "grad_norm": 1.8298819065093994, + "learning_rate": 3.205445544554455e-07, + "loss": 0.5737, + "mean_token_accuracy": 0.8166000843048096, + "num_tokens": 18963817.0, + "step": 519 + }, + { + "epoch": 0.09656453110492108, + "grad_norm": 1.7997316122055054, + "learning_rate": 3.2116336633663364e-07, + "loss": 0.4456, + "mean_token_accuracy": 0.853374183177948, + "num_tokens": 18996699.0, + "step": 520 + }, + { + "epoch": 0.09675023212627669, + "grad_norm": 1.8185734748840332, + "learning_rate": 3.217821782178218e-07, + "loss": 0.5336, + "mean_token_accuracy": 0.8256894946098328, + "num_tokens": 19031889.0, + "step": 521 + }, + { + "epoch": 0.09693593314763231, + "grad_norm": 1.8714755773544312, + "learning_rate": 3.224009900990099e-07, + "loss": 0.5114, + "mean_token_accuracy": 0.8337975740432739, + "num_tokens": 19068018.0, + "step": 522 + }, + { + "epoch": 0.09712163416898793, + "grad_norm": 1.6294392347335815, + "learning_rate": 3.23019801980198e-07, + "loss": 0.4981, + "mean_token_accuracy": 0.838498592376709, + "num_tokens": 19109760.0, + "step": 523 + }, + { + "epoch": 0.09730733519034354, + "grad_norm": 1.8271156549453735, + "learning_rate": 3.2363861386138614e-07, + "loss": 0.5195, + "mean_token_accuracy": 0.8362114429473877, + "num_tokens": 19144106.0, + "step": 524 + }, + { + "epoch": 0.09749303621169916, + "grad_norm": 1.6706438064575195, + "learning_rate": 3.2425742574257425e-07, + "loss": 0.474, + "mean_token_accuracy": 0.8484335541725159, + "num_tokens": 19181204.0, + "step": 525 + }, + { + "epoch": 0.09767873723305479, + "grad_norm": 1.6253302097320557, + "learning_rate": 3.2487623762376236e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.837227463722229, + "num_tokens": 19223026.0, + "step": 526 + }, + { + "epoch": 0.0978644382544104, + "grad_norm": 1.827972173690796, + "learning_rate": 3.2549504950495047e-07, + "loss": 0.5165, + "mean_token_accuracy": 0.834446132183075, + "num_tokens": 19262921.0, + "step": 527 + }, + { + "epoch": 0.09805013927576602, + "grad_norm": 1.7891566753387451, + "learning_rate": 3.2611386138613863e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8423935770988464, + "num_tokens": 19297078.0, + "step": 528 + }, + { + "epoch": 0.09823584029712164, + "grad_norm": 1.5278303623199463, + "learning_rate": 3.2673267326732674e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.8379319310188293, + "num_tokens": 19340629.0, + "step": 529 + }, + { + "epoch": 0.09842154131847725, + "grad_norm": 1.7844899892807007, + "learning_rate": 3.2735148514851486e-07, + "loss": 0.4755, + "mean_token_accuracy": 0.8420858383178711, + "num_tokens": 19375821.0, + "step": 530 + }, + { + "epoch": 0.09860724233983287, + "grad_norm": 1.6323555707931519, + "learning_rate": 3.2797029702970297e-07, + "loss": 0.5388, + "mean_token_accuracy": 0.8234074115753174, + "num_tokens": 19418414.0, + "step": 531 + }, + { + "epoch": 0.09879294336118849, + "grad_norm": 1.9366419315338135, + "learning_rate": 3.285891089108911e-07, + "loss": 0.5428, + "mean_token_accuracy": 0.8254796266555786, + "num_tokens": 19453157.0, + "step": 532 + }, + { + "epoch": 0.0989786443825441, + "grad_norm": 1.742398977279663, + "learning_rate": 3.292079207920792e-07, + "loss": 0.5523, + "mean_token_accuracy": 0.8252707123756409, + "num_tokens": 19492060.0, + "step": 533 + }, + { + "epoch": 0.09916434540389972, + "grad_norm": 1.7253055572509766, + "learning_rate": 3.298267326732673e-07, + "loss": 0.4542, + "mean_token_accuracy": 0.8521918654441833, + "num_tokens": 19526499.0, + "step": 534 + }, + { + "epoch": 0.09935004642525534, + "grad_norm": 1.8062299489974976, + "learning_rate": 3.3044554455445547e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.8314684629440308, + "num_tokens": 19560808.0, + "step": 535 + }, + { + "epoch": 0.09953574744661095, + "grad_norm": 1.671360731124878, + "learning_rate": 3.310643564356436e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.8562952280044556, + "num_tokens": 19598226.0, + "step": 536 + }, + { + "epoch": 0.09972144846796657, + "grad_norm": 1.908894419670105, + "learning_rate": 3.316831683168317e-07, + "loss": 0.539, + "mean_token_accuracy": 0.8273254632949829, + "num_tokens": 19633476.0, + "step": 537 + }, + { + "epoch": 0.0999071494893222, + "grad_norm": 1.7374324798583984, + "learning_rate": 3.323019801980198e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8348839282989502, + "num_tokens": 19673599.0, + "step": 538 + }, + { + "epoch": 0.1000928505106778, + "grad_norm": 1.7049862146377563, + "learning_rate": 3.329207920792079e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8419643640518188, + "num_tokens": 19709798.0, + "step": 539 + }, + { + "epoch": 0.10027855153203342, + "grad_norm": 1.8224891424179077, + "learning_rate": 3.33539603960396e-07, + "loss": 0.4919, + "mean_token_accuracy": 0.8437052369117737, + "num_tokens": 19746830.0, + "step": 540 + }, + { + "epoch": 0.10046425255338905, + "grad_norm": 1.7270194292068481, + "learning_rate": 3.341584158415842e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8393008708953857, + "num_tokens": 19782610.0, + "step": 541 + }, + { + "epoch": 0.10064995357474465, + "grad_norm": 1.6765811443328857, + "learning_rate": 3.347772277227723e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.8420393466949463, + "num_tokens": 19819783.0, + "step": 542 + }, + { + "epoch": 0.10083565459610028, + "grad_norm": 1.5409190654754639, + "learning_rate": 3.353960396039604e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8425474166870117, + "num_tokens": 19860158.0, + "step": 543 + }, + { + "epoch": 0.1010213556174559, + "grad_norm": 1.8242532014846802, + "learning_rate": 3.360148514851485e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8413755893707275, + "num_tokens": 19892654.0, + "step": 544 + }, + { + "epoch": 0.10120705663881151, + "grad_norm": 1.7505472898483276, + "learning_rate": 3.3663366336633663e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.8331897854804993, + "num_tokens": 19925740.0, + "step": 545 + }, + { + "epoch": 0.10139275766016713, + "grad_norm": 1.6602492332458496, + "learning_rate": 3.3725247524752474e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.8330788016319275, + "num_tokens": 19964695.0, + "step": 546 + }, + { + "epoch": 0.10157845868152275, + "grad_norm": 1.650656819343567, + "learning_rate": 3.3787128712871285e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.8409056663513184, + "num_tokens": 20010350.0, + "step": 547 + }, + { + "epoch": 0.10176415970287836, + "grad_norm": 1.8557555675506592, + "learning_rate": 3.38490099009901e-07, + "loss": 0.514, + "mean_token_accuracy": 0.8299791216850281, + "num_tokens": 20045250.0, + "step": 548 + }, + { + "epoch": 0.10194986072423398, + "grad_norm": 1.8352298736572266, + "learning_rate": 3.3910891089108913e-07, + "loss": 0.4552, + "mean_token_accuracy": 0.8481549024581909, + "num_tokens": 20080117.0, + "step": 549 + }, + { + "epoch": 0.1021355617455896, + "grad_norm": 1.8678219318389893, + "learning_rate": 3.3972772277227724e-07, + "loss": 0.5196, + "mean_token_accuracy": 0.8289051055908203, + "num_tokens": 20109869.0, + "step": 550 + }, + { + "epoch": 0.10232126276694521, + "grad_norm": 1.6951984167099, + "learning_rate": 3.4034653465346535e-07, + "loss": 0.4619, + "mean_token_accuracy": 0.8471819162368774, + "num_tokens": 20147726.0, + "step": 551 + }, + { + "epoch": 0.10250696378830083, + "grad_norm": 1.624566674232483, + "learning_rate": 3.4096534653465346e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8428320288658142, + "num_tokens": 20189417.0, + "step": 552 + }, + { + "epoch": 0.10269266480965646, + "grad_norm": 1.575785517692566, + "learning_rate": 3.415841584158416e-07, + "loss": 0.5192, + "mean_token_accuracy": 0.8353899717330933, + "num_tokens": 20235927.0, + "step": 553 + }, + { + "epoch": 0.10287836583101206, + "grad_norm": 1.6655457019805908, + "learning_rate": 3.4220297029702974e-07, + "loss": 0.4989, + "mean_token_accuracy": 0.8357017040252686, + "num_tokens": 20273186.0, + "step": 554 + }, + { + "epoch": 0.10306406685236769, + "grad_norm": 1.8294023275375366, + "learning_rate": 3.4282178217821785e-07, + "loss": 0.5276, + "mean_token_accuracy": 0.8330848217010498, + "num_tokens": 20308582.0, + "step": 555 + }, + { + "epoch": 0.10324976787372331, + "grad_norm": 1.9374653100967407, + "learning_rate": 3.4344059405940596e-07, + "loss": 0.4919, + "mean_token_accuracy": 0.839945912361145, + "num_tokens": 20338701.0, + "step": 556 + }, + { + "epoch": 0.10343546889507892, + "grad_norm": 1.614524483680725, + "learning_rate": 3.44059405940594e-07, + "loss": 0.467, + "mean_token_accuracy": 0.8481858968734741, + "num_tokens": 20378601.0, + "step": 557 + }, + { + "epoch": 0.10362116991643454, + "grad_norm": 1.6145877838134766, + "learning_rate": 3.4467821782178213e-07, + "loss": 0.4687, + "mean_token_accuracy": 0.8438677787780762, + "num_tokens": 20417182.0, + "step": 558 + }, + { + "epoch": 0.10380687093779016, + "grad_norm": 1.8389887809753418, + "learning_rate": 3.4529702970297024e-07, + "loss": 0.564, + "mean_token_accuracy": 0.8211185336112976, + "num_tokens": 20452730.0, + "step": 559 + }, + { + "epoch": 0.10399257195914577, + "grad_norm": 1.5358150005340576, + "learning_rate": 3.4591584158415835e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8396482467651367, + "num_tokens": 20494944.0, + "step": 560 + }, + { + "epoch": 0.10417827298050139, + "grad_norm": 1.74868905544281, + "learning_rate": 3.465346534653465e-07, + "loss": 0.4938, + "mean_token_accuracy": 0.8355794548988342, + "num_tokens": 20527576.0, + "step": 561 + }, + { + "epoch": 0.10436397400185701, + "grad_norm": 1.7223161458969116, + "learning_rate": 3.4715346534653463e-07, + "loss": 0.5253, + "mean_token_accuracy": 0.8273496627807617, + "num_tokens": 20567762.0, + "step": 562 + }, + { + "epoch": 0.10454967502321262, + "grad_norm": 1.7169710397720337, + "learning_rate": 3.4777227722772274e-07, + "loss": 0.5628, + "mean_token_accuracy": 0.8198044896125793, + "num_tokens": 20608861.0, + "step": 563 + }, + { + "epoch": 0.10473537604456824, + "grad_norm": 1.815171241760254, + "learning_rate": 3.4839108910891085e-07, + "loss": 0.5043, + "mean_token_accuracy": 0.8406663537025452, + "num_tokens": 20644235.0, + "step": 564 + }, + { + "epoch": 0.10492107706592387, + "grad_norm": 1.7133773565292358, + "learning_rate": 3.4900990099009896e-07, + "loss": 0.4483, + "mean_token_accuracy": 0.8490710854530334, + "num_tokens": 20679260.0, + "step": 565 + }, + { + "epoch": 0.10510677808727947, + "grad_norm": 1.5891042947769165, + "learning_rate": 3.496287128712871e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.8374161720275879, + "num_tokens": 20725376.0, + "step": 566 + }, + { + "epoch": 0.1052924791086351, + "grad_norm": 1.6623011827468872, + "learning_rate": 3.502475247524752e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.8366472125053406, + "num_tokens": 20761226.0, + "step": 567 + }, + { + "epoch": 0.10547818012999072, + "grad_norm": 1.7687681913375854, + "learning_rate": 3.5086633663366335e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8483437299728394, + "num_tokens": 20794369.0, + "step": 568 + }, + { + "epoch": 0.10566388115134633, + "grad_norm": 1.5567145347595215, + "learning_rate": 3.5148514851485146e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8404388427734375, + "num_tokens": 20834002.0, + "step": 569 + }, + { + "epoch": 0.10584958217270195, + "grad_norm": 1.714025855064392, + "learning_rate": 3.5210396039603957e-07, + "loss": 0.5215, + "mean_token_accuracy": 0.8316373229026794, + "num_tokens": 20874607.0, + "step": 570 + }, + { + "epoch": 0.10603528319405757, + "grad_norm": 1.5320414304733276, + "learning_rate": 3.527227722772277e-07, + "loss": 0.5229, + "mean_token_accuracy": 0.8342320322990417, + "num_tokens": 20918785.0, + "step": 571 + }, + { + "epoch": 0.10622098421541319, + "grad_norm": 1.7238081693649292, + "learning_rate": 3.533415841584158e-07, + "loss": 0.492, + "mean_token_accuracy": 0.8423163890838623, + "num_tokens": 20955928.0, + "step": 572 + }, + { + "epoch": 0.1064066852367688, + "grad_norm": 1.6822470426559448, + "learning_rate": 3.539603960396039e-07, + "loss": 0.529, + "mean_token_accuracy": 0.8297192454338074, + "num_tokens": 20994302.0, + "step": 573 + }, + { + "epoch": 0.10659238625812442, + "grad_norm": 1.6720366477966309, + "learning_rate": 3.5457920792079207e-07, + "loss": 0.5386, + "mean_token_accuracy": 0.829151451587677, + "num_tokens": 21033235.0, + "step": 574 + }, + { + "epoch": 0.10677808727948004, + "grad_norm": 1.6591477394104004, + "learning_rate": 3.551980198019802e-07, + "loss": 0.5302, + "mean_token_accuracy": 0.829524040222168, + "num_tokens": 21072262.0, + "step": 575 + }, + { + "epoch": 0.10696378830083565, + "grad_norm": 1.6300867795944214, + "learning_rate": 3.558168316831683e-07, + "loss": 0.5311, + "mean_token_accuracy": 0.8252575397491455, + "num_tokens": 21113033.0, + "step": 576 + }, + { + "epoch": 0.10714948932219127, + "grad_norm": 1.6938514709472656, + "learning_rate": 3.564356435643564e-07, + "loss": 0.5528, + "mean_token_accuracy": 0.820501446723938, + "num_tokens": 21151285.0, + "step": 577 + }, + { + "epoch": 0.1073351903435469, + "grad_norm": 1.6582939624786377, + "learning_rate": 3.570544554455445e-07, + "loss": 0.5124, + "mean_token_accuracy": 0.8333945870399475, + "num_tokens": 21187124.0, + "step": 578 + }, + { + "epoch": 0.1075208913649025, + "grad_norm": 1.686386227607727, + "learning_rate": 3.5767326732673263e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.8423062562942505, + "num_tokens": 21223354.0, + "step": 579 + }, + { + "epoch": 0.10770659238625813, + "grad_norm": 1.6093298196792603, + "learning_rate": 3.5829207920792074e-07, + "loss": 0.4651, + "mean_token_accuracy": 0.8437772393226624, + "num_tokens": 21257069.0, + "step": 580 + }, + { + "epoch": 0.10789229340761375, + "grad_norm": 1.5260944366455078, + "learning_rate": 3.589108910891089e-07, + "loss": 0.4778, + "mean_token_accuracy": 0.8423726558685303, + "num_tokens": 21298481.0, + "step": 581 + }, + { + "epoch": 0.10807799442896936, + "grad_norm": 1.802343487739563, + "learning_rate": 3.59529702970297e-07, + "loss": 0.5302, + "mean_token_accuracy": 0.8306587338447571, + "num_tokens": 21339161.0, + "step": 582 + }, + { + "epoch": 0.10826369545032498, + "grad_norm": 1.7642027139663696, + "learning_rate": 3.601485148514851e-07, + "loss": 0.4717, + "mean_token_accuracy": 0.8485210537910461, + "num_tokens": 21372074.0, + "step": 583 + }, + { + "epoch": 0.1084493964716806, + "grad_norm": 1.7170226573944092, + "learning_rate": 3.6076732673267324e-07, + "loss": 0.4975, + "mean_token_accuracy": 0.8363314270973206, + "num_tokens": 21407662.0, + "step": 584 + }, + { + "epoch": 0.10863509749303621, + "grad_norm": 1.5515711307525635, + "learning_rate": 3.6138613861386135e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8435603976249695, + "num_tokens": 21448752.0, + "step": 585 + }, + { + "epoch": 0.10882079851439183, + "grad_norm": 1.749977469444275, + "learning_rate": 3.6200495049504946e-07, + "loss": 0.4694, + "mean_token_accuracy": 0.850644052028656, + "num_tokens": 21486885.0, + "step": 586 + }, + { + "epoch": 0.10900649953574745, + "grad_norm": 2.0103800296783447, + "learning_rate": 3.6262376237623757e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8453103303909302, + "num_tokens": 21516651.0, + "step": 587 + }, + { + "epoch": 0.10919220055710306, + "grad_norm": 1.8093748092651367, + "learning_rate": 3.6324257425742574e-07, + "loss": 0.5499, + "mean_token_accuracy": 0.8265542387962341, + "num_tokens": 21553040.0, + "step": 588 + }, + { + "epoch": 0.10937790157845868, + "grad_norm": 1.6715028285980225, + "learning_rate": 3.6386138613861385e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.8402878046035767, + "num_tokens": 21592503.0, + "step": 589 + }, + { + "epoch": 0.1095636025998143, + "grad_norm": 1.8436719179153442, + "learning_rate": 3.6448019801980196e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8324528336524963, + "num_tokens": 21631548.0, + "step": 590 + }, + { + "epoch": 0.10974930362116991, + "grad_norm": 1.7223507165908813, + "learning_rate": 3.6509900990099007e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.8448682427406311, + "num_tokens": 21664712.0, + "step": 591 + }, + { + "epoch": 0.10993500464252554, + "grad_norm": 1.784918189048767, + "learning_rate": 3.657178217821782e-07, + "loss": 0.5862, + "mean_token_accuracy": 0.8198175430297852, + "num_tokens": 21703056.0, + "step": 592 + }, + { + "epoch": 0.11012070566388116, + "grad_norm": 1.7381131649017334, + "learning_rate": 3.663366336633663e-07, + "loss": 0.5184, + "mean_token_accuracy": 0.8311951160430908, + "num_tokens": 21741132.0, + "step": 593 + }, + { + "epoch": 0.11030640668523677, + "grad_norm": 1.5273984670639038, + "learning_rate": 3.6695544554455446e-07, + "loss": 0.49, + "mean_token_accuracy": 0.83889240026474, + "num_tokens": 21785042.0, + "step": 594 + }, + { + "epoch": 0.11049210770659239, + "grad_norm": 1.7481865882873535, + "learning_rate": 3.6757425742574257e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.8277322053909302, + "num_tokens": 21821601.0, + "step": 595 + }, + { + "epoch": 0.11067780872794801, + "grad_norm": 2.044297933578491, + "learning_rate": 3.681930693069307e-07, + "loss": 0.5951, + "mean_token_accuracy": 0.8195481300354004, + "num_tokens": 21854227.0, + "step": 596 + }, + { + "epoch": 0.11086350974930362, + "grad_norm": 1.7665752172470093, + "learning_rate": 3.688118811881188e-07, + "loss": 0.475, + "mean_token_accuracy": 0.8450273275375366, + "num_tokens": 21886257.0, + "step": 597 + }, + { + "epoch": 0.11104921077065924, + "grad_norm": 1.7536596059799194, + "learning_rate": 3.694306930693069e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.8332566022872925, + "num_tokens": 21921941.0, + "step": 598 + }, + { + "epoch": 0.11123491179201486, + "grad_norm": 1.8458713293075562, + "learning_rate": 3.70049504950495e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8426470160484314, + "num_tokens": 21953211.0, + "step": 599 + }, + { + "epoch": 0.11142061281337047, + "grad_norm": 1.7217024564743042, + "learning_rate": 3.706683168316831e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8390963077545166, + "num_tokens": 21990750.0, + "step": 600 + }, + { + "epoch": 0.11160631383472609, + "grad_norm": 1.7841565608978271, + "learning_rate": 3.712871287128713e-07, + "loss": 0.4789, + "mean_token_accuracy": 0.8422154784202576, + "num_tokens": 22027535.0, + "step": 601 + }, + { + "epoch": 0.11179201485608171, + "grad_norm": 1.611831545829773, + "learning_rate": 3.719059405940594e-07, + "loss": 0.52, + "mean_token_accuracy": 0.8327088356018066, + "num_tokens": 22067668.0, + "step": 602 + }, + { + "epoch": 0.11197771587743732, + "grad_norm": 1.678972840309143, + "learning_rate": 3.725247524752475e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.8366981744766235, + "num_tokens": 22104170.0, + "step": 603 + }, + { + "epoch": 0.11216341689879294, + "grad_norm": 1.635759949684143, + "learning_rate": 3.731435643564356e-07, + "loss": 0.4414, + "mean_token_accuracy": 0.8584191799163818, + "num_tokens": 22139057.0, + "step": 604 + }, + { + "epoch": 0.11234911792014857, + "grad_norm": 1.582102656364441, + "learning_rate": 3.7376237623762373e-07, + "loss": 0.4268, + "mean_token_accuracy": 0.8600362539291382, + "num_tokens": 22175806.0, + "step": 605 + }, + { + "epoch": 0.11253481894150417, + "grad_norm": 1.705242395401001, + "learning_rate": 3.7438118811881185e-07, + "loss": 0.5376, + "mean_token_accuracy": 0.8256412744522095, + "num_tokens": 22213581.0, + "step": 606 + }, + { + "epoch": 0.1127205199628598, + "grad_norm": 1.5332374572753906, + "learning_rate": 3.75e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8450785875320435, + "num_tokens": 22255216.0, + "step": 607 + }, + { + "epoch": 0.11290622098421542, + "grad_norm": 1.773079514503479, + "learning_rate": 3.756188118811881e-07, + "loss": 0.4652, + "mean_token_accuracy": 0.8457003235816956, + "num_tokens": 22286548.0, + "step": 608 + }, + { + "epoch": 0.11309192200557103, + "grad_norm": 1.7879204750061035, + "learning_rate": 3.7623762376237623e-07, + "loss": 0.4441, + "mean_token_accuracy": 0.8536774516105652, + "num_tokens": 22322500.0, + "step": 609 + }, + { + "epoch": 0.11327762302692665, + "grad_norm": 1.741539478302002, + "learning_rate": 3.7685643564356434e-07, + "loss": 0.498, + "mean_token_accuracy": 0.8360504508018494, + "num_tokens": 22360737.0, + "step": 610 + }, + { + "epoch": 0.11346332404828227, + "grad_norm": 1.8680919408798218, + "learning_rate": 3.7747524752475245e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8425346612930298, + "num_tokens": 22390469.0, + "step": 611 + }, + { + "epoch": 0.11364902506963788, + "grad_norm": 1.8240264654159546, + "learning_rate": 3.7809405940594057e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8373618125915527, + "num_tokens": 22423496.0, + "step": 612 + }, + { + "epoch": 0.1138347260909935, + "grad_norm": 1.7959682941436768, + "learning_rate": 3.787128712871287e-07, + "loss": 0.4972, + "mean_token_accuracy": 0.8395775556564331, + "num_tokens": 22459317.0, + "step": 613 + }, + { + "epoch": 0.11402042711234912, + "grad_norm": 1.5299568176269531, + "learning_rate": 3.7933168316831684e-07, + "loss": 0.4626, + "mean_token_accuracy": 0.8478765487670898, + "num_tokens": 22499412.0, + "step": 614 + }, + { + "epoch": 0.11420612813370473, + "grad_norm": 1.550770878791809, + "learning_rate": 3.7995049504950495e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8415618538856506, + "num_tokens": 22539224.0, + "step": 615 + }, + { + "epoch": 0.11439182915506035, + "grad_norm": 1.651060938835144, + "learning_rate": 3.8056930693069306e-07, + "loss": 0.5539, + "mean_token_accuracy": 0.8215432167053223, + "num_tokens": 22582015.0, + "step": 616 + }, + { + "epoch": 0.11457753017641598, + "grad_norm": 1.6972688436508179, + "learning_rate": 3.811881188118812e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.8446073532104492, + "num_tokens": 22617530.0, + "step": 617 + }, + { + "epoch": 0.11476323119777158, + "grad_norm": 1.616962194442749, + "learning_rate": 3.818069306930693e-07, + "loss": 0.505, + "mean_token_accuracy": 0.8381170034408569, + "num_tokens": 22657767.0, + "step": 618 + }, + { + "epoch": 0.1149489322191272, + "grad_norm": 1.752884030342102, + "learning_rate": 3.824257425742574e-07, + "loss": 0.4492, + "mean_token_accuracy": 0.8554533123970032, + "num_tokens": 22689095.0, + "step": 619 + }, + { + "epoch": 0.11513463324048283, + "grad_norm": 1.6086541414260864, + "learning_rate": 3.830445544554455e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8405457139015198, + "num_tokens": 22726568.0, + "step": 620 + }, + { + "epoch": 0.11532033426183844, + "grad_norm": 1.6070679426193237, + "learning_rate": 3.836633663366337e-07, + "loss": 0.4645, + "mean_token_accuracy": 0.8488445281982422, + "num_tokens": 22766041.0, + "step": 621 + }, + { + "epoch": 0.11550603528319406, + "grad_norm": 1.7329134941101074, + "learning_rate": 3.842821782178218e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.8339089155197144, + "num_tokens": 22801600.0, + "step": 622 + }, + { + "epoch": 0.11569173630454968, + "grad_norm": 1.745592474937439, + "learning_rate": 3.849009900990099e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.8450073599815369, + "num_tokens": 22837476.0, + "step": 623 + }, + { + "epoch": 0.11587743732590529, + "grad_norm": 1.6646995544433594, + "learning_rate": 3.85519801980198e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8469041585922241, + "num_tokens": 22869985.0, + "step": 624 + }, + { + "epoch": 0.11606313834726091, + "grad_norm": 1.7482236623764038, + "learning_rate": 3.861386138613861e-07, + "loss": 0.5117, + "mean_token_accuracy": 0.8353041410446167, + "num_tokens": 22907385.0, + "step": 625 + }, + { + "epoch": 0.11624883936861653, + "grad_norm": 1.6072161197662354, + "learning_rate": 3.8675742574257423e-07, + "loss": 0.4848, + "mean_token_accuracy": 0.8445567488670349, + "num_tokens": 22944857.0, + "step": 626 + }, + { + "epoch": 0.11643454038997214, + "grad_norm": 1.7496461868286133, + "learning_rate": 3.873762376237624e-07, + "loss": 0.4789, + "mean_token_accuracy": 0.843334436416626, + "num_tokens": 22979193.0, + "step": 627 + }, + { + "epoch": 0.11662024141132776, + "grad_norm": 1.5540517568588257, + "learning_rate": 3.879950495049505e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.8469566106796265, + "num_tokens": 23019486.0, + "step": 628 + }, + { + "epoch": 0.11680594243268339, + "grad_norm": 1.502274751663208, + "learning_rate": 3.886138613861386e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8498680591583252, + "num_tokens": 23063065.0, + "step": 629 + }, + { + "epoch": 0.116991643454039, + "grad_norm": 1.6210949420928955, + "learning_rate": 3.8923267326732673e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8373868465423584, + "num_tokens": 23098537.0, + "step": 630 + }, + { + "epoch": 0.11717734447539462, + "grad_norm": 1.6964654922485352, + "learning_rate": 3.8985148514851484e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.8388627171516418, + "num_tokens": 23133820.0, + "step": 631 + }, + { + "epoch": 0.11736304549675024, + "grad_norm": 1.6408265829086304, + "learning_rate": 3.9047029702970295e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.8413575887680054, + "num_tokens": 23172623.0, + "step": 632 + }, + { + "epoch": 0.11754874651810585, + "grad_norm": 1.7221596240997314, + "learning_rate": 3.9108910891089106e-07, + "loss": 0.5654, + "mean_token_accuracy": 0.8195146322250366, + "num_tokens": 23211615.0, + "step": 633 + }, + { + "epoch": 0.11773444753946147, + "grad_norm": 1.6567840576171875, + "learning_rate": 3.9170792079207923e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.8564414978027344, + "num_tokens": 23245398.0, + "step": 634 + }, + { + "epoch": 0.11792014856081709, + "grad_norm": 1.6062994003295898, + "learning_rate": 3.9232673267326734e-07, + "loss": 0.4223, + "mean_token_accuracy": 0.8596598505973816, + "num_tokens": 23282962.0, + "step": 635 + }, + { + "epoch": 0.1181058495821727, + "grad_norm": 1.8104171752929688, + "learning_rate": 3.9294554455445545e-07, + "loss": 0.5347, + "mean_token_accuracy": 0.8266956210136414, + "num_tokens": 23318630.0, + "step": 636 + }, + { + "epoch": 0.11829155060352832, + "grad_norm": 1.6642340421676636, + "learning_rate": 3.9356435643564356e-07, + "loss": 0.5248, + "mean_token_accuracy": 0.8295465707778931, + "num_tokens": 23357231.0, + "step": 637 + }, + { + "epoch": 0.11847725162488394, + "grad_norm": 1.7644562721252441, + "learning_rate": 3.9418316831683167e-07, + "loss": 0.5107, + "mean_token_accuracy": 0.8309240937232971, + "num_tokens": 23391308.0, + "step": 638 + }, + { + "epoch": 0.11866295264623955, + "grad_norm": 1.8307145833969116, + "learning_rate": 3.948019801980198e-07, + "loss": 0.4963, + "mean_token_accuracy": 0.8342198133468628, + "num_tokens": 23421514.0, + "step": 639 + }, + { + "epoch": 0.11884865366759517, + "grad_norm": 1.6811249256134033, + "learning_rate": 3.954207920792079e-07, + "loss": 0.4414, + "mean_token_accuracy": 0.8550790548324585, + "num_tokens": 23455152.0, + "step": 640 + }, + { + "epoch": 0.1190343546889508, + "grad_norm": 1.752500295639038, + "learning_rate": 3.9603960396039606e-07, + "loss": 0.4515, + "mean_token_accuracy": 0.8533394932746887, + "num_tokens": 23490250.0, + "step": 641 + }, + { + "epoch": 0.1192200557103064, + "grad_norm": 1.5529401302337646, + "learning_rate": 3.9665841584158417e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.842458963394165, + "num_tokens": 23533256.0, + "step": 642 + }, + { + "epoch": 0.11940575673166202, + "grad_norm": 1.7101507186889648, + "learning_rate": 3.972772277227723e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8385646939277649, + "num_tokens": 23567323.0, + "step": 643 + }, + { + "epoch": 0.11959145775301765, + "grad_norm": 1.700891137123108, + "learning_rate": 3.978960396039604e-07, + "loss": 0.5104, + "mean_token_accuracy": 0.8353382349014282, + "num_tokens": 23603515.0, + "step": 644 + }, + { + "epoch": 0.11977715877437325, + "grad_norm": 1.7242751121520996, + "learning_rate": 3.985148514851485e-07, + "loss": 0.498, + "mean_token_accuracy": 0.836825966835022, + "num_tokens": 23637505.0, + "step": 645 + }, + { + "epoch": 0.11996285979572888, + "grad_norm": 1.6429157257080078, + "learning_rate": 3.991336633663366e-07, + "loss": 0.454, + "mean_token_accuracy": 0.8512842655181885, + "num_tokens": 23674425.0, + "step": 646 + }, + { + "epoch": 0.1201485608170845, + "grad_norm": 1.5960047245025635, + "learning_rate": 3.997524752475248e-07, + "loss": 0.5157, + "mean_token_accuracy": 0.8298210501670837, + "num_tokens": 23715374.0, + "step": 647 + }, + { + "epoch": 0.1203342618384401, + "grad_norm": 1.685814619064331, + "learning_rate": 4.003712871287129e-07, + "loss": 0.5191, + "mean_token_accuracy": 0.8395642638206482, + "num_tokens": 23749342.0, + "step": 648 + }, + { + "epoch": 0.12051996285979573, + "grad_norm": 1.6523722410202026, + "learning_rate": 4.00990099009901e-07, + "loss": 0.487, + "mean_token_accuracy": 0.8404354453086853, + "num_tokens": 23787410.0, + "step": 649 + }, + { + "epoch": 0.12070566388115135, + "grad_norm": 1.6017284393310547, + "learning_rate": 4.016089108910891e-07, + "loss": 0.501, + "mean_token_accuracy": 0.8348616361618042, + "num_tokens": 23825587.0, + "step": 650 + }, + { + "epoch": 0.12089136490250696, + "grad_norm": 1.680623173713684, + "learning_rate": 4.022277227722772e-07, + "loss": 0.5058, + "mean_token_accuracy": 0.8403103947639465, + "num_tokens": 23863221.0, + "step": 651 + }, + { + "epoch": 0.12107706592386258, + "grad_norm": 1.673704743385315, + "learning_rate": 4.0284653465346534e-07, + "loss": 0.4382, + "mean_token_accuracy": 0.8561486601829529, + "num_tokens": 23898802.0, + "step": 652 + }, + { + "epoch": 0.1212627669452182, + "grad_norm": 1.8075003623962402, + "learning_rate": 4.0346534653465345e-07, + "loss": 0.5063, + "mean_token_accuracy": 0.8362687230110168, + "num_tokens": 23929556.0, + "step": 653 + }, + { + "epoch": 0.12144846796657381, + "grad_norm": 1.6371432542800903, + "learning_rate": 4.040841584158416e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.83266282081604, + "num_tokens": 23971276.0, + "step": 654 + }, + { + "epoch": 0.12163416898792943, + "grad_norm": 1.7849608659744263, + "learning_rate": 4.047029702970297e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.838053822517395, + "num_tokens": 24002504.0, + "step": 655 + }, + { + "epoch": 0.12181987000928506, + "grad_norm": 1.7410662174224854, + "learning_rate": 4.0532178217821783e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.8461110591888428, + "num_tokens": 24037574.0, + "step": 656 + }, + { + "epoch": 0.12200557103064066, + "grad_norm": 1.7084591388702393, + "learning_rate": 4.0594059405940595e-07, + "loss": 0.4383, + "mean_token_accuracy": 0.8557016849517822, + "num_tokens": 24069513.0, + "step": 657 + }, + { + "epoch": 0.12219127205199629, + "grad_norm": 1.6723361015319824, + "learning_rate": 4.06559405940594e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.8442301154136658, + "num_tokens": 24104202.0, + "step": 658 + }, + { + "epoch": 0.12237697307335191, + "grad_norm": 1.6241955757141113, + "learning_rate": 4.071782178217821e-07, + "loss": 0.4591, + "mean_token_accuracy": 0.8450983762741089, + "num_tokens": 24142101.0, + "step": 659 + }, + { + "epoch": 0.12256267409470752, + "grad_norm": 1.7821537256240845, + "learning_rate": 4.0779702970297023e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.842862606048584, + "num_tokens": 24176115.0, + "step": 660 + }, + { + "epoch": 0.12274837511606314, + "grad_norm": 1.800782322883606, + "learning_rate": 4.084158415841584e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.836855947971344, + "num_tokens": 24206564.0, + "step": 661 + }, + { + "epoch": 0.12293407613741876, + "grad_norm": 1.9220027923583984, + "learning_rate": 4.090346534653465e-07, + "loss": 0.5556, + "mean_token_accuracy": 0.8196845054626465, + "num_tokens": 24238164.0, + "step": 662 + }, + { + "epoch": 0.12311977715877437, + "grad_norm": 1.537522554397583, + "learning_rate": 4.096534653465346e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.847728431224823, + "num_tokens": 24277646.0, + "step": 663 + }, + { + "epoch": 0.12330547818012999, + "grad_norm": 1.6892238855361938, + "learning_rate": 4.102722772277227e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.8335249423980713, + "num_tokens": 24314353.0, + "step": 664 + }, + { + "epoch": 0.12349117920148561, + "grad_norm": 1.7057063579559326, + "learning_rate": 4.1089108910891084e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.844129741191864, + "num_tokens": 24349263.0, + "step": 665 + }, + { + "epoch": 0.12367688022284122, + "grad_norm": 1.8396267890930176, + "learning_rate": 4.1150990099009895e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.835634708404541, + "num_tokens": 24382563.0, + "step": 666 + }, + { + "epoch": 0.12386258124419684, + "grad_norm": 1.6914552450180054, + "learning_rate": 4.121287128712871e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8416524529457092, + "num_tokens": 24417998.0, + "step": 667 + }, + { + "epoch": 0.12404828226555246, + "grad_norm": 1.7446746826171875, + "learning_rate": 4.127475247524752e-07, + "loss": 0.4404, + "mean_token_accuracy": 0.8523446321487427, + "num_tokens": 24447330.0, + "step": 668 + }, + { + "epoch": 0.12423398328690807, + "grad_norm": 1.9324209690093994, + "learning_rate": 4.1336633663366333e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8396952152252197, + "num_tokens": 24477373.0, + "step": 669 + }, + { + "epoch": 0.1244196843082637, + "grad_norm": 1.6638100147247314, + "learning_rate": 4.1398514851485145e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.8480604887008667, + "num_tokens": 24512894.0, + "step": 670 + }, + { + "epoch": 0.12460538532961932, + "grad_norm": 1.5155761241912842, + "learning_rate": 4.1460396039603956e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.8489493131637573, + "num_tokens": 24553466.0, + "step": 671 + }, + { + "epoch": 0.12479108635097493, + "grad_norm": 1.7812966108322144, + "learning_rate": 4.1522277227722767e-07, + "loss": 0.4224, + "mean_token_accuracy": 0.8588248491287231, + "num_tokens": 24583785.0, + "step": 672 + }, + { + "epoch": 0.12497678737233055, + "grad_norm": 1.7541613578796387, + "learning_rate": 4.158415841584158e-07, + "loss": 0.5267, + "mean_token_accuracy": 0.8291702270507812, + "num_tokens": 24616071.0, + "step": 673 + }, + { + "epoch": 0.12516248839368616, + "grad_norm": 1.6678062677383423, + "learning_rate": 4.1646039603960394e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.8440574407577515, + "num_tokens": 24651817.0, + "step": 674 + }, + { + "epoch": 0.12534818941504178, + "grad_norm": 1.8027539253234863, + "learning_rate": 4.1707920792079206e-07, + "loss": 0.5324, + "mean_token_accuracy": 0.8296906352043152, + "num_tokens": 24685760.0, + "step": 675 + }, + { + "epoch": 0.1255338904363974, + "grad_norm": 1.8054654598236084, + "learning_rate": 4.1769801980198017e-07, + "loss": 0.4589, + "mean_token_accuracy": 0.8514273762702942, + "num_tokens": 24718153.0, + "step": 676 + }, + { + "epoch": 0.12571959145775302, + "grad_norm": 1.6462867259979248, + "learning_rate": 4.183168316831683e-07, + "loss": 0.4898, + "mean_token_accuracy": 0.8417646884918213, + "num_tokens": 24754415.0, + "step": 677 + }, + { + "epoch": 0.12590529247910864, + "grad_norm": 1.582537055015564, + "learning_rate": 4.189356435643564e-07, + "loss": 0.479, + "mean_token_accuracy": 0.8431377410888672, + "num_tokens": 24793503.0, + "step": 678 + }, + { + "epoch": 0.12609099350046427, + "grad_norm": 1.5910742282867432, + "learning_rate": 4.195544554455445e-07, + "loss": 0.4767, + "mean_token_accuracy": 0.843808650970459, + "num_tokens": 24829736.0, + "step": 679 + }, + { + "epoch": 0.12627669452181986, + "grad_norm": 1.515458345413208, + "learning_rate": 4.2017326732673266e-07, + "loss": 0.4474, + "mean_token_accuracy": 0.8549364805221558, + "num_tokens": 24870536.0, + "step": 680 + }, + { + "epoch": 0.12646239554317548, + "grad_norm": 1.7822328805923462, + "learning_rate": 4.207920792079208e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.8511645793914795, + "num_tokens": 24904635.0, + "step": 681 + }, + { + "epoch": 0.1266480965645311, + "grad_norm": 1.66118586063385, + "learning_rate": 4.214108910891089e-07, + "loss": 0.5192, + "mean_token_accuracy": 0.8307396769523621, + "num_tokens": 24938861.0, + "step": 682 + }, + { + "epoch": 0.12683379758588673, + "grad_norm": 1.617479920387268, + "learning_rate": 4.22029702970297e-07, + "loss": 0.4117, + "mean_token_accuracy": 0.862377405166626, + "num_tokens": 24972882.0, + "step": 683 + }, + { + "epoch": 0.12701949860724235, + "grad_norm": 1.5797631740570068, + "learning_rate": 4.226485148514851e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8401862382888794, + "num_tokens": 25011895.0, + "step": 684 + }, + { + "epoch": 0.12720519962859797, + "grad_norm": 1.5611422061920166, + "learning_rate": 4.232673267326732e-07, + "loss": 0.4177, + "mean_token_accuracy": 0.861930787563324, + "num_tokens": 25046314.0, + "step": 685 + }, + { + "epoch": 0.12739090064995356, + "grad_norm": 1.6761192083358765, + "learning_rate": 4.2388613861386133e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8431137204170227, + "num_tokens": 25082681.0, + "step": 686 + }, + { + "epoch": 0.1275766016713092, + "grad_norm": 1.8065755367279053, + "learning_rate": 4.245049504950495e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8483721017837524, + "num_tokens": 25114709.0, + "step": 687 + }, + { + "epoch": 0.1277623026926648, + "grad_norm": 1.725322961807251, + "learning_rate": 4.251237623762376e-07, + "loss": 0.5499, + "mean_token_accuracy": 0.8218761682510376, + "num_tokens": 25152290.0, + "step": 688 + }, + { + "epoch": 0.12794800371402043, + "grad_norm": 1.6312742233276367, + "learning_rate": 4.257425742574257e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.8387904167175293, + "num_tokens": 25189610.0, + "step": 689 + }, + { + "epoch": 0.12813370473537605, + "grad_norm": 1.704898476600647, + "learning_rate": 4.2636138613861383e-07, + "loss": 0.4655, + "mean_token_accuracy": 0.8500596284866333, + "num_tokens": 25221738.0, + "step": 690 + }, + { + "epoch": 0.12831940575673167, + "grad_norm": 1.6912661790847778, + "learning_rate": 4.2698019801980194e-07, + "loss": 0.4303, + "mean_token_accuracy": 0.8561491966247559, + "num_tokens": 25254955.0, + "step": 691 + }, + { + "epoch": 0.12850510677808727, + "grad_norm": 1.5143885612487793, + "learning_rate": 4.2759900990099005e-07, + "loss": 0.4355, + "mean_token_accuracy": 0.8582675457000732, + "num_tokens": 25293959.0, + "step": 692 + }, + { + "epoch": 0.1286908077994429, + "grad_norm": 1.8034615516662598, + "learning_rate": 4.2821782178217816e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8384144306182861, + "num_tokens": 25325301.0, + "step": 693 + }, + { + "epoch": 0.1288765088207985, + "grad_norm": 1.7786548137664795, + "learning_rate": 4.2883663366336633e-07, + "loss": 0.5328, + "mean_token_accuracy": 0.826202392578125, + "num_tokens": 25364175.0, + "step": 694 + }, + { + "epoch": 0.12906220984215414, + "grad_norm": 1.6732923984527588, + "learning_rate": 4.2945544554455444e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8457670211791992, + "num_tokens": 25399291.0, + "step": 695 + }, + { + "epoch": 0.12924791086350976, + "grad_norm": 1.9173334836959839, + "learning_rate": 4.3007425742574255e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.8460980653762817, + "num_tokens": 25427349.0, + "step": 696 + }, + { + "epoch": 0.12943361188486538, + "grad_norm": 1.6701080799102783, + "learning_rate": 4.3069306930693066e-07, + "loss": 0.4744, + "mean_token_accuracy": 0.847484290599823, + "num_tokens": 25461252.0, + "step": 697 + }, + { + "epoch": 0.12961931290622097, + "grad_norm": 1.9128960371017456, + "learning_rate": 4.313118811881188e-07, + "loss": 0.5438, + "mean_token_accuracy": 0.8266684412956238, + "num_tokens": 25491289.0, + "step": 698 + }, + { + "epoch": 0.1298050139275766, + "grad_norm": 1.8411848545074463, + "learning_rate": 4.319306930693069e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.8304078578948975, + "num_tokens": 25522022.0, + "step": 699 + }, + { + "epoch": 0.12999071494893222, + "grad_norm": 1.5379477739334106, + "learning_rate": 4.3254950495049505e-07, + "loss": 0.4363, + "mean_token_accuracy": 0.8598184585571289, + "num_tokens": 25558520.0, + "step": 700 + }, + { + "epoch": 0.13017641597028784, + "grad_norm": 1.7422990798950195, + "learning_rate": 4.3316831683168316e-07, + "loss": 0.5238, + "mean_token_accuracy": 0.8266115188598633, + "num_tokens": 25596201.0, + "step": 701 + }, + { + "epoch": 0.13036211699164346, + "grad_norm": 1.6530314683914185, + "learning_rate": 4.3378712871287127e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8439878225326538, + "num_tokens": 25634062.0, + "step": 702 + }, + { + "epoch": 0.13054781801299908, + "grad_norm": 1.6369075775146484, + "learning_rate": 4.344059405940594e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.8525375127792358, + "num_tokens": 25670991.0, + "step": 703 + }, + { + "epoch": 0.13073351903435468, + "grad_norm": 1.7538877725601196, + "learning_rate": 4.350247524752475e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8428654670715332, + "num_tokens": 25705465.0, + "step": 704 + }, + { + "epoch": 0.1309192200557103, + "grad_norm": 1.6768485307693481, + "learning_rate": 4.356435643564356e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8368866443634033, + "num_tokens": 25739277.0, + "step": 705 + }, + { + "epoch": 0.13110492107706592, + "grad_norm": 1.629978060722351, + "learning_rate": 4.362623762376237e-07, + "loss": 0.4447, + "mean_token_accuracy": 0.8528372049331665, + "num_tokens": 25774765.0, + "step": 706 + }, + { + "epoch": 0.13129062209842154, + "grad_norm": 1.603540301322937, + "learning_rate": 4.368811881188119e-07, + "loss": 0.4489, + "mean_token_accuracy": 0.8477466106414795, + "num_tokens": 25808153.0, + "step": 707 + }, + { + "epoch": 0.13147632311977717, + "grad_norm": 1.714062213897705, + "learning_rate": 4.375e-07, + "loss": 0.4676, + "mean_token_accuracy": 0.8484161496162415, + "num_tokens": 25840242.0, + "step": 708 + }, + { + "epoch": 0.1316620241411328, + "grad_norm": 1.6661890745162964, + "learning_rate": 4.381188118811881e-07, + "loss": 0.4989, + "mean_token_accuracy": 0.8359367847442627, + "num_tokens": 25877868.0, + "step": 709 + }, + { + "epoch": 0.13184772516248838, + "grad_norm": 1.6554278135299683, + "learning_rate": 4.387376237623762e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8420897722244263, + "num_tokens": 25911865.0, + "step": 710 + }, + { + "epoch": 0.132033426183844, + "grad_norm": 1.751333236694336, + "learning_rate": 4.3935643564356433e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.852662205696106, + "num_tokens": 25946499.0, + "step": 711 + }, + { + "epoch": 0.13221912720519963, + "grad_norm": 1.7400239706039429, + "learning_rate": 4.3997524752475244e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8346745371818542, + "num_tokens": 25980158.0, + "step": 712 + }, + { + "epoch": 0.13240482822655525, + "grad_norm": 1.6825282573699951, + "learning_rate": 4.405940594059406e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.8444312214851379, + "num_tokens": 26014516.0, + "step": 713 + }, + { + "epoch": 0.13259052924791087, + "grad_norm": 1.5963932275772095, + "learning_rate": 4.412128712871287e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.8475759029388428, + "num_tokens": 26049257.0, + "step": 714 + }, + { + "epoch": 0.1327762302692665, + "grad_norm": 1.7639960050582886, + "learning_rate": 4.418316831683168e-07, + "loss": 0.4627, + "mean_token_accuracy": 0.8481186628341675, + "num_tokens": 26084361.0, + "step": 715 + }, + { + "epoch": 0.1329619312906221, + "grad_norm": 1.7541604042053223, + "learning_rate": 4.4245049504950494e-07, + "loss": 0.4771, + "mean_token_accuracy": 0.8434625267982483, + "num_tokens": 26120738.0, + "step": 716 + }, + { + "epoch": 0.1331476323119777, + "grad_norm": 1.6304280757904053, + "learning_rate": 4.4306930693069305e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.8408646583557129, + "num_tokens": 26160307.0, + "step": 717 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.5552202463150024, + "learning_rate": 4.4368811881188116e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8349318504333496, + "num_tokens": 26199010.0, + "step": 718 + }, + { + "epoch": 0.13351903435468895, + "grad_norm": 1.5842242240905762, + "learning_rate": 4.4430693069306927e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.8383473753929138, + "num_tokens": 26237825.0, + "step": 719 + }, + { + "epoch": 0.13370473537604458, + "grad_norm": 1.612735629081726, + "learning_rate": 4.4492574257425744e-07, + "loss": 0.5092, + "mean_token_accuracy": 0.8349506855010986, + "num_tokens": 26277699.0, + "step": 720 + }, + { + "epoch": 0.1338904363974002, + "grad_norm": 1.6182758808135986, + "learning_rate": 4.4554455445544555e-07, + "loss": 0.5336, + "mean_token_accuracy": 0.8224008083343506, + "num_tokens": 26320728.0, + "step": 721 + }, + { + "epoch": 0.1340761374187558, + "grad_norm": 1.4582949876785278, + "learning_rate": 4.4616336633663366e-07, + "loss": 0.474, + "mean_token_accuracy": 0.8485730290412903, + "num_tokens": 26360694.0, + "step": 722 + }, + { + "epoch": 0.13426183844011141, + "grad_norm": 1.772718071937561, + "learning_rate": 4.4678217821782177e-07, + "loss": 0.4732, + "mean_token_accuracy": 0.8434861898422241, + "num_tokens": 26392722.0, + "step": 723 + }, + { + "epoch": 0.13444753946146704, + "grad_norm": 1.6581882238388062, + "learning_rate": 4.474009900990099e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.832233726978302, + "num_tokens": 26428990.0, + "step": 724 + }, + { + "epoch": 0.13463324048282266, + "grad_norm": 1.5647152662277222, + "learning_rate": 4.48019801980198e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8400471210479736, + "num_tokens": 26467604.0, + "step": 725 + }, + { + "epoch": 0.13481894150417828, + "grad_norm": 1.5814077854156494, + "learning_rate": 4.486386138613861e-07, + "loss": 0.4741, + "mean_token_accuracy": 0.8425112962722778, + "num_tokens": 26503498.0, + "step": 726 + }, + { + "epoch": 0.1350046425255339, + "grad_norm": 1.5962685346603394, + "learning_rate": 4.4925742574257427e-07, + "loss": 0.5289, + "mean_token_accuracy": 0.8280892372131348, + "num_tokens": 26543274.0, + "step": 727 + }, + { + "epoch": 0.1351903435468895, + "grad_norm": 1.5168381929397583, + "learning_rate": 4.498762376237624e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.8416644930839539, + "num_tokens": 26583879.0, + "step": 728 + }, + { + "epoch": 0.13537604456824512, + "grad_norm": 1.5159642696380615, + "learning_rate": 4.504950495049505e-07, + "loss": 0.4353, + "mean_token_accuracy": 0.8573001623153687, + "num_tokens": 26621531.0, + "step": 729 + }, + { + "epoch": 0.13556174558960074, + "grad_norm": 1.6221660375595093, + "learning_rate": 4.511138613861386e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8359017372131348, + "num_tokens": 26657929.0, + "step": 730 + }, + { + "epoch": 0.13574744661095636, + "grad_norm": 1.6106047630310059, + "learning_rate": 4.517326732673267e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.8422811031341553, + "num_tokens": 26693067.0, + "step": 731 + }, + { + "epoch": 0.13593314763231198, + "grad_norm": 1.5944041013717651, + "learning_rate": 4.523514851485148e-07, + "loss": 0.4774, + "mean_token_accuracy": 0.8419498205184937, + "num_tokens": 26731473.0, + "step": 732 + }, + { + "epoch": 0.1361188486536676, + "grad_norm": 1.7349050045013428, + "learning_rate": 4.52970297029703e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.8416719436645508, + "num_tokens": 26764706.0, + "step": 733 + }, + { + "epoch": 0.1363045496750232, + "grad_norm": 1.5989195108413696, + "learning_rate": 4.535891089108911e-07, + "loss": 0.4844, + "mean_token_accuracy": 0.8441289663314819, + "num_tokens": 26800283.0, + "step": 734 + }, + { + "epoch": 0.13649025069637882, + "grad_norm": 1.5435892343521118, + "learning_rate": 4.542079207920792e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8480713367462158, + "num_tokens": 26837747.0, + "step": 735 + }, + { + "epoch": 0.13667595171773445, + "grad_norm": 1.420343041419983, + "learning_rate": 4.548267326732673e-07, + "loss": 0.4218, + "mean_token_accuracy": 0.8590600490570068, + "num_tokens": 26876796.0, + "step": 736 + }, + { + "epoch": 0.13686165273909007, + "grad_norm": 1.593887209892273, + "learning_rate": 4.5544554455445543e-07, + "loss": 0.4357, + "mean_token_accuracy": 0.8558186292648315, + "num_tokens": 26910844.0, + "step": 737 + }, + { + "epoch": 0.1370473537604457, + "grad_norm": 1.476017713546753, + "learning_rate": 4.5606435643564354e-07, + "loss": 0.4416, + "mean_token_accuracy": 0.8532381057739258, + "num_tokens": 26951243.0, + "step": 738 + }, + { + "epoch": 0.1372330547818013, + "grad_norm": 1.559935212135315, + "learning_rate": 4.5668316831683166e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8485026359558105, + "num_tokens": 26993105.0, + "step": 739 + }, + { + "epoch": 0.1374187558031569, + "grad_norm": 1.7094013690948486, + "learning_rate": 4.573019801980198e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.8360997438430786, + "num_tokens": 27029646.0, + "step": 740 + }, + { + "epoch": 0.13760445682451253, + "grad_norm": 1.8174148797988892, + "learning_rate": 4.5792079207920793e-07, + "loss": 0.4662, + "mean_token_accuracy": 0.8463684320449829, + "num_tokens": 27058256.0, + "step": 741 + }, + { + "epoch": 0.13779015784586815, + "grad_norm": 1.6813061237335205, + "learning_rate": 4.5853960396039604e-07, + "loss": 0.462, + "mean_token_accuracy": 0.8446034789085388, + "num_tokens": 27090921.0, + "step": 742 + }, + { + "epoch": 0.13797585886722377, + "grad_norm": 1.6892613172531128, + "learning_rate": 4.5915841584158415e-07, + "loss": 0.4679, + "mean_token_accuracy": 0.8483959436416626, + "num_tokens": 27125220.0, + "step": 743 + }, + { + "epoch": 0.1381615598885794, + "grad_norm": 1.5819740295410156, + "learning_rate": 4.5977722772277227e-07, + "loss": 0.4403, + "mean_token_accuracy": 0.8547965288162231, + "num_tokens": 27163220.0, + "step": 744 + }, + { + "epoch": 0.13834726090993502, + "grad_norm": 1.6177067756652832, + "learning_rate": 4.603960396039604e-07, + "loss": 0.5131, + "mean_token_accuracy": 0.8289379477500916, + "num_tokens": 27202313.0, + "step": 745 + }, + { + "epoch": 0.1385329619312906, + "grad_norm": 1.6390671730041504, + "learning_rate": 4.6101485148514854e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.845136284828186, + "num_tokens": 27242109.0, + "step": 746 + }, + { + "epoch": 0.13871866295264623, + "grad_norm": 1.5675177574157715, + "learning_rate": 4.6163366336633665e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8384986519813538, + "num_tokens": 27282225.0, + "step": 747 + }, + { + "epoch": 0.13890436397400185, + "grad_norm": 1.6820926666259766, + "learning_rate": 4.6225247524752476e-07, + "loss": 0.4488, + "mean_token_accuracy": 0.8548405170440674, + "num_tokens": 27314869.0, + "step": 748 + }, + { + "epoch": 0.13909006499535748, + "grad_norm": 1.7528138160705566, + "learning_rate": 4.628712871287129e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.838326632976532, + "num_tokens": 27347349.0, + "step": 749 + }, + { + "epoch": 0.1392757660167131, + "grad_norm": 1.5175272226333618, + "learning_rate": 4.63490099009901e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8566344380378723, + "num_tokens": 27386247.0, + "step": 750 + }, + { + "epoch": 0.13946146703806872, + "grad_norm": 1.636618733406067, + "learning_rate": 4.641089108910891e-07, + "loss": 0.4494, + "mean_token_accuracy": 0.8514664769172668, + "num_tokens": 27424400.0, + "step": 751 + }, + { + "epoch": 0.13964716805942431, + "grad_norm": 1.5758973360061646, + "learning_rate": 4.647277227722772e-07, + "loss": 0.4509, + "mean_token_accuracy": 0.8504307270050049, + "num_tokens": 27461723.0, + "step": 752 + }, + { + "epoch": 0.13983286908077994, + "grad_norm": 1.6110308170318604, + "learning_rate": 4.6534653465346537e-07, + "loss": 0.448, + "mean_token_accuracy": 0.8523389101028442, + "num_tokens": 27499438.0, + "step": 753 + }, + { + "epoch": 0.14001857010213556, + "grad_norm": 1.686204195022583, + "learning_rate": 4.659653465346535e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.8504970669746399, + "num_tokens": 27532167.0, + "step": 754 + }, + { + "epoch": 0.14020427112349118, + "grad_norm": 1.5805413722991943, + "learning_rate": 4.665841584158416e-07, + "loss": 0.5115, + "mean_token_accuracy": 0.8355611562728882, + "num_tokens": 27571226.0, + "step": 755 + }, + { + "epoch": 0.1403899721448468, + "grad_norm": 1.6311819553375244, + "learning_rate": 4.672029702970297e-07, + "loss": 0.448, + "mean_token_accuracy": 0.8531358242034912, + "num_tokens": 27607169.0, + "step": 756 + }, + { + "epoch": 0.14057567316620243, + "grad_norm": 1.7550959587097168, + "learning_rate": 4.678217821782178e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8380564451217651, + "num_tokens": 27638495.0, + "step": 757 + }, + { + "epoch": 0.14076137418755802, + "grad_norm": 1.5812292098999023, + "learning_rate": 4.6844059405940593e-07, + "loss": 0.5181, + "mean_token_accuracy": 0.8336012363433838, + "num_tokens": 27676676.0, + "step": 758 + }, + { + "epoch": 0.14094707520891364, + "grad_norm": 1.6200908422470093, + "learning_rate": 4.69059405940594e-07, + "loss": 0.495, + "mean_token_accuracy": 0.8391515016555786, + "num_tokens": 27714154.0, + "step": 759 + }, + { + "epoch": 0.14113277623026926, + "grad_norm": 1.7253758907318115, + "learning_rate": 4.6967821782178215e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.8444937467575073, + "num_tokens": 27749293.0, + "step": 760 + }, + { + "epoch": 0.14131847725162489, + "grad_norm": 1.6355372667312622, + "learning_rate": 4.7029702970297026e-07, + "loss": 0.4396, + "mean_token_accuracy": 0.8553729057312012, + "num_tokens": 27783451.0, + "step": 761 + }, + { + "epoch": 0.1415041782729805, + "grad_norm": 1.6985266208648682, + "learning_rate": 4.709158415841584e-07, + "loss": 0.4447, + "mean_token_accuracy": 0.8517684936523438, + "num_tokens": 27817195.0, + "step": 762 + }, + { + "epoch": 0.14168987929433613, + "grad_norm": 1.6098090410232544, + "learning_rate": 4.715346534653465e-07, + "loss": 0.4627, + "mean_token_accuracy": 0.8487061262130737, + "num_tokens": 27855405.0, + "step": 763 + }, + { + "epoch": 0.14187558031569172, + "grad_norm": 1.649333119392395, + "learning_rate": 4.721534653465346e-07, + "loss": 0.5597, + "mean_token_accuracy": 0.8201389908790588, + "num_tokens": 27896930.0, + "step": 764 + }, + { + "epoch": 0.14206128133704735, + "grad_norm": 1.7794227600097656, + "learning_rate": 4.727722772277227e-07, + "loss": 0.5037, + "mean_token_accuracy": 0.835436224937439, + "num_tokens": 27930840.0, + "step": 765 + }, + { + "epoch": 0.14224698235840297, + "grad_norm": 1.6709026098251343, + "learning_rate": 4.733910891089108e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8395886421203613, + "num_tokens": 27968892.0, + "step": 766 + }, + { + "epoch": 0.1424326833797586, + "grad_norm": 1.5895322561264038, + "learning_rate": 4.74009900990099e-07, + "loss": 0.4762, + "mean_token_accuracy": 0.8443618416786194, + "num_tokens": 28008513.0, + "step": 767 + }, + { + "epoch": 0.1426183844011142, + "grad_norm": 1.7229772806167603, + "learning_rate": 4.746287128712871e-07, + "loss": 0.506, + "mean_token_accuracy": 0.8361892104148865, + "num_tokens": 28044109.0, + "step": 768 + }, + { + "epoch": 0.14280408542246983, + "grad_norm": 1.6566991806030273, + "learning_rate": 4.752475247524752e-07, + "loss": 0.5124, + "mean_token_accuracy": 0.8309531211853027, + "num_tokens": 28079151.0, + "step": 769 + }, + { + "epoch": 0.14298978644382543, + "grad_norm": 1.4762920141220093, + "learning_rate": 4.758663366336633e-07, + "loss": 0.442, + "mean_token_accuracy": 0.856386661529541, + "num_tokens": 28121850.0, + "step": 770 + }, + { + "epoch": 0.14317548746518105, + "grad_norm": 1.5260200500488281, + "learning_rate": 4.7648514851485143e-07, + "loss": 0.4389, + "mean_token_accuracy": 0.8591230511665344, + "num_tokens": 28160841.0, + "step": 771 + }, + { + "epoch": 0.14336118848653667, + "grad_norm": 1.5708967447280884, + "learning_rate": 4.771039603960396e-07, + "loss": 0.4159, + "mean_token_accuracy": 0.8601686954498291, + "num_tokens": 28197717.0, + "step": 772 + }, + { + "epoch": 0.1435468895078923, + "grad_norm": 1.622570514678955, + "learning_rate": 4.777227722772277e-07, + "loss": 0.4444, + "mean_token_accuracy": 0.8536767959594727, + "num_tokens": 28233893.0, + "step": 773 + }, + { + "epoch": 0.14373259052924792, + "grad_norm": 1.5867457389831543, + "learning_rate": 4.783415841584158e-07, + "loss": 0.487, + "mean_token_accuracy": 0.8395296335220337, + "num_tokens": 28269758.0, + "step": 774 + }, + { + "epoch": 0.14391829155060354, + "grad_norm": 1.5454561710357666, + "learning_rate": 4.789603960396039e-07, + "loss": 0.4051, + "mean_token_accuracy": 0.8657976388931274, + "num_tokens": 28306443.0, + "step": 775 + }, + { + "epoch": 0.14410399257195913, + "grad_norm": 1.5753659009933472, + "learning_rate": 4.79579207920792e-07, + "loss": 0.4547, + "mean_token_accuracy": 0.8527190685272217, + "num_tokens": 28344622.0, + "step": 776 + }, + { + "epoch": 0.14428969359331476, + "grad_norm": 1.665644884109497, + "learning_rate": 4.801980198019802e-07, + "loss": 0.4665, + "mean_token_accuracy": 0.8471471071243286, + "num_tokens": 28381372.0, + "step": 777 + }, + { + "epoch": 0.14447539461467038, + "grad_norm": 1.6118084192276, + "learning_rate": 4.808168316831683e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.8338513374328613, + "num_tokens": 28419868.0, + "step": 778 + }, + { + "epoch": 0.144661095636026, + "grad_norm": 1.6729671955108643, + "learning_rate": 4.814356435643564e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.8344984650611877, + "num_tokens": 28456827.0, + "step": 779 + }, + { + "epoch": 0.14484679665738162, + "grad_norm": 1.5738716125488281, + "learning_rate": 4.820544554455445e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.8349927663803101, + "num_tokens": 28496169.0, + "step": 780 + }, + { + "epoch": 0.14503249767873724, + "grad_norm": 1.7063076496124268, + "learning_rate": 4.826732673267326e-07, + "loss": 0.4629, + "mean_token_accuracy": 0.847977876663208, + "num_tokens": 28528701.0, + "step": 781 + }, + { + "epoch": 0.14521819870009284, + "grad_norm": 1.572239875793457, + "learning_rate": 4.832920792079207e-07, + "loss": 0.5028, + "mean_token_accuracy": 0.8347068428993225, + "num_tokens": 28566862.0, + "step": 782 + }, + { + "epoch": 0.14540389972144846, + "grad_norm": 1.5289136171340942, + "learning_rate": 4.839108910891089e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.844834566116333, + "num_tokens": 28608283.0, + "step": 783 + }, + { + "epoch": 0.14558960074280408, + "grad_norm": 1.9364404678344727, + "learning_rate": 4.84529702970297e-07, + "loss": 0.5268, + "mean_token_accuracy": 0.8246715664863586, + "num_tokens": 28638872.0, + "step": 784 + }, + { + "epoch": 0.1457753017641597, + "grad_norm": 1.7547245025634766, + "learning_rate": 4.851485148514851e-07, + "loss": 0.5178, + "mean_token_accuracy": 0.8326768279075623, + "num_tokens": 28672934.0, + "step": 785 + }, + { + "epoch": 0.14596100278551533, + "grad_norm": 1.528746247291565, + "learning_rate": 4.857673267326733e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8474951386451721, + "num_tokens": 28711232.0, + "step": 786 + }, + { + "epoch": 0.14614670380687095, + "grad_norm": 1.512567400932312, + "learning_rate": 4.863861386138613e-07, + "loss": 0.4371, + "mean_token_accuracy": 0.856112539768219, + "num_tokens": 28748083.0, + "step": 787 + }, + { + "epoch": 0.14633240482822654, + "grad_norm": 1.5965261459350586, + "learning_rate": 4.870049504950495e-07, + "loss": 0.4407, + "mean_token_accuracy": 0.8527596592903137, + "num_tokens": 28782799.0, + "step": 788 + }, + { + "epoch": 0.14651810584958216, + "grad_norm": 1.7550617456436157, + "learning_rate": 4.876237623762375e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.8432385325431824, + "num_tokens": 28814847.0, + "step": 789 + }, + { + "epoch": 0.1467038068709378, + "grad_norm": 1.494938850402832, + "learning_rate": 4.882425742574257e-07, + "loss": 0.4633, + "mean_token_accuracy": 0.8467056155204773, + "num_tokens": 28853879.0, + "step": 790 + }, + { + "epoch": 0.1468895078922934, + "grad_norm": 1.6110467910766602, + "learning_rate": 4.888613861386139e-07, + "loss": 0.4668, + "mean_token_accuracy": 0.8420159816741943, + "num_tokens": 28891053.0, + "step": 791 + }, + { + "epoch": 0.14707520891364903, + "grad_norm": 1.7752528190612793, + "learning_rate": 4.894801980198019e-07, + "loss": 0.457, + "mean_token_accuracy": 0.8465792536735535, + "num_tokens": 28924616.0, + "step": 792 + }, + { + "epoch": 0.14726090993500465, + "grad_norm": 1.5198466777801514, + "learning_rate": 4.900990099009901e-07, + "loss": 0.4521, + "mean_token_accuracy": 0.8473249077796936, + "num_tokens": 28963860.0, + "step": 793 + }, + { + "epoch": 0.14744661095636025, + "grad_norm": 1.972707748413086, + "learning_rate": 4.907178217821781e-07, + "loss": 0.5273, + "mean_token_accuracy": 0.8225167989730835, + "num_tokens": 28990634.0, + "step": 794 + }, + { + "epoch": 0.14763231197771587, + "grad_norm": 1.6760627031326294, + "learning_rate": 4.913366336633663e-07, + "loss": 0.4652, + "mean_token_accuracy": 0.8474137783050537, + "num_tokens": 29026986.0, + "step": 795 + }, + { + "epoch": 0.1478180129990715, + "grad_norm": 1.8003957271575928, + "learning_rate": 4.919554455445545e-07, + "loss": 0.4708, + "mean_token_accuracy": 0.844618022441864, + "num_tokens": 29057983.0, + "step": 796 + }, + { + "epoch": 0.1480037140204271, + "grad_norm": 1.64003324508667, + "learning_rate": 4.925742574257425e-07, + "loss": 0.5145, + "mean_token_accuracy": 0.8292741179466248, + "num_tokens": 29096423.0, + "step": 797 + }, + { + "epoch": 0.14818941504178273, + "grad_norm": 1.5076452493667603, + "learning_rate": 4.931930693069307e-07, + "loss": 0.4563, + "mean_token_accuracy": 0.8505017757415771, + "num_tokens": 29133904.0, + "step": 798 + }, + { + "epoch": 0.14837511606313836, + "grad_norm": 1.552809715270996, + "learning_rate": 4.938118811881188e-07, + "loss": 0.4422, + "mean_token_accuracy": 0.8520506024360657, + "num_tokens": 29169293.0, + "step": 799 + }, + { + "epoch": 0.14856081708449395, + "grad_norm": 1.5078601837158203, + "learning_rate": 4.944306930693069e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.8464140295982361, + "num_tokens": 29211853.0, + "step": 800 + }, + { + "epoch": 0.14874651810584957, + "grad_norm": 1.6883293390274048, + "learning_rate": 4.95049504950495e-07, + "loss": 0.519, + "mean_token_accuracy": 0.8346641063690186, + "num_tokens": 29250454.0, + "step": 801 + }, + { + "epoch": 0.1489322191272052, + "grad_norm": 1.6073412895202637, + "learning_rate": 4.956683168316831e-07, + "loss": 0.4401, + "mean_token_accuracy": 0.8564658164978027, + "num_tokens": 29286126.0, + "step": 802 + }, + { + "epoch": 0.14911792014856082, + "grad_norm": 1.6400083303451538, + "learning_rate": 4.962871287128713e-07, + "loss": 0.4226, + "mean_token_accuracy": 0.8600130677223206, + "num_tokens": 29322555.0, + "step": 803 + }, + { + "epoch": 0.14930362116991644, + "grad_norm": 1.4847689867019653, + "learning_rate": 4.969059405940594e-07, + "loss": 0.4525, + "mean_token_accuracy": 0.8503918051719666, + "num_tokens": 29361573.0, + "step": 804 + }, + { + "epoch": 0.14948932219127206, + "grad_norm": 1.5911247730255127, + "learning_rate": 4.975247524752475e-07, + "loss": 0.4737, + "mean_token_accuracy": 0.8450283408164978, + "num_tokens": 29395412.0, + "step": 805 + }, + { + "epoch": 0.14967502321262766, + "grad_norm": 1.701493740081787, + "learning_rate": 4.981435643564356e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8452171087265015, + "num_tokens": 29428316.0, + "step": 806 + }, + { + "epoch": 0.14986072423398328, + "grad_norm": 1.7784218788146973, + "learning_rate": 4.987623762376238e-07, + "loss": 0.504, + "mean_token_accuracy": 0.8363494277000427, + "num_tokens": 29463746.0, + "step": 807 + }, + { + "epoch": 0.1500464252553389, + "grad_norm": 1.734844446182251, + "learning_rate": 4.993811881188118e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.8382062315940857, + "num_tokens": 29496231.0, + "step": 808 + }, + { + "epoch": 0.15023212627669452, + "grad_norm": 1.465118169784546, + "learning_rate": 5e-07, + "loss": 0.4262, + "mean_token_accuracy": 0.8578707575798035, + "num_tokens": 29535913.0, + "step": 809 + }, + { + "epoch": 0.15041782729805014, + "grad_norm": 1.505316972732544, + "learning_rate": 5.00618811881188e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8511964678764343, + "num_tokens": 29577167.0, + "step": 810 + }, + { + "epoch": 0.15060352831940577, + "grad_norm": 1.5291879177093506, + "learning_rate": 5.012376237623762e-07, + "loss": 0.4925, + "mean_token_accuracy": 0.8366577625274658, + "num_tokens": 29618743.0, + "step": 811 + }, + { + "epoch": 0.15078922934076136, + "grad_norm": 1.6160011291503906, + "learning_rate": 5.018564356435643e-07, + "loss": 0.5112, + "mean_token_accuracy": 0.8339696526527405, + "num_tokens": 29658201.0, + "step": 812 + }, + { + "epoch": 0.15097493036211698, + "grad_norm": 1.4742094278335571, + "learning_rate": 5.024752475247524e-07, + "loss": 0.4769, + "mean_token_accuracy": 0.8451594114303589, + "num_tokens": 29700362.0, + "step": 813 + }, + { + "epoch": 0.1511606313834726, + "grad_norm": 1.6103081703186035, + "learning_rate": 5.030940594059405e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8458762168884277, + "num_tokens": 29737618.0, + "step": 814 + }, + { + "epoch": 0.15134633240482823, + "grad_norm": 1.6842072010040283, + "learning_rate": 5.037128712871286e-07, + "loss": 0.4971, + "mean_token_accuracy": 0.8332959413528442, + "num_tokens": 29772731.0, + "step": 815 + }, + { + "epoch": 0.15153203342618385, + "grad_norm": 1.6391756534576416, + "learning_rate": 5.043316831683168e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.8428400754928589, + "num_tokens": 29810111.0, + "step": 816 + }, + { + "epoch": 0.15171773444753947, + "grad_norm": 1.503485083580017, + "learning_rate": 5.04950495049505e-07, + "loss": 0.5369, + "mean_token_accuracy": 0.8226590752601624, + "num_tokens": 29856321.0, + "step": 817 + }, + { + "epoch": 0.1519034354688951, + "grad_norm": 1.54231595993042, + "learning_rate": 5.05569306930693e-07, + "loss": 0.4337, + "mean_token_accuracy": 0.855392575263977, + "num_tokens": 29892301.0, + "step": 818 + }, + { + "epoch": 0.1520891364902507, + "grad_norm": 1.640402913093567, + "learning_rate": 5.061881188118812e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8459517955780029, + "num_tokens": 29930955.0, + "step": 819 + }, + { + "epoch": 0.1522748375116063, + "grad_norm": 1.5836118459701538, + "learning_rate": 5.068069306930693e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8362855911254883, + "num_tokens": 29968568.0, + "step": 820 + }, + { + "epoch": 0.15246053853296193, + "grad_norm": 1.554067850112915, + "learning_rate": 5.074257425742574e-07, + "loss": 0.4916, + "mean_token_accuracy": 0.8378332853317261, + "num_tokens": 30007133.0, + "step": 821 + }, + { + "epoch": 0.15264623955431755, + "grad_norm": 1.6691982746124268, + "learning_rate": 5.080445544554455e-07, + "loss": 0.4576, + "mean_token_accuracy": 0.8514504432678223, + "num_tokens": 30045159.0, + "step": 822 + }, + { + "epoch": 0.15283194057567318, + "grad_norm": 1.9182658195495605, + "learning_rate": 5.086633663366336e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8448576331138611, + "num_tokens": 30076405.0, + "step": 823 + }, + { + "epoch": 0.1530176415970288, + "grad_norm": 1.6954401731491089, + "learning_rate": 5.092821782178217e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8433636426925659, + "num_tokens": 30112592.0, + "step": 824 + }, + { + "epoch": 0.1532033426183844, + "grad_norm": 1.6281622648239136, + "learning_rate": 5.099009900990099e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.8421867489814758, + "num_tokens": 30150933.0, + "step": 825 + }, + { + "epoch": 0.15338904363974, + "grad_norm": 1.5713016986846924, + "learning_rate": 5.105198019801979e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.846615731716156, + "num_tokens": 30188868.0, + "step": 826 + }, + { + "epoch": 0.15357474466109564, + "grad_norm": 1.6477911472320557, + "learning_rate": 5.111386138613861e-07, + "loss": 0.4553, + "mean_token_accuracy": 0.8491777777671814, + "num_tokens": 30222973.0, + "step": 827 + }, + { + "epoch": 0.15376044568245126, + "grad_norm": 1.5839323997497559, + "learning_rate": 5.117574257425741e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.8436392545700073, + "num_tokens": 30262166.0, + "step": 828 + }, + { + "epoch": 0.15394614670380688, + "grad_norm": 1.675723671913147, + "learning_rate": 5.123762376237624e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.8435362577438354, + "num_tokens": 30297705.0, + "step": 829 + }, + { + "epoch": 0.1541318477251625, + "grad_norm": 1.632258653640747, + "learning_rate": 5.129950495049505e-07, + "loss": 0.4997, + "mean_token_accuracy": 0.8328487873077393, + "num_tokens": 30339704.0, + "step": 830 + }, + { + "epoch": 0.1543175487465181, + "grad_norm": 1.6507319211959839, + "learning_rate": 5.136138613861386e-07, + "loss": 0.4901, + "mean_token_accuracy": 0.8386744260787964, + "num_tokens": 30374218.0, + "step": 831 + }, + { + "epoch": 0.15450324976787372, + "grad_norm": 1.6434139013290405, + "learning_rate": 5.142326732673267e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.8379985094070435, + "num_tokens": 30409810.0, + "step": 832 + }, + { + "epoch": 0.15468895078922934, + "grad_norm": 1.593735694885254, + "learning_rate": 5.148514851485149e-07, + "loss": 0.4522, + "mean_token_accuracy": 0.8500747680664062, + "num_tokens": 30446491.0, + "step": 833 + }, + { + "epoch": 0.15487465181058496, + "grad_norm": 1.9633800983428955, + "learning_rate": 5.154702970297029e-07, + "loss": 0.4232, + "mean_token_accuracy": 0.8531507849693298, + "num_tokens": 30473369.0, + "step": 834 + }, + { + "epoch": 0.15506035283194058, + "grad_norm": 1.6766947507858276, + "learning_rate": 5.160891089108911e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.8430466651916504, + "num_tokens": 30508085.0, + "step": 835 + }, + { + "epoch": 0.1552460538532962, + "grad_norm": 1.6234374046325684, + "learning_rate": 5.167079207920791e-07, + "loss": 0.5099, + "mean_token_accuracy": 0.8347383141517639, + "num_tokens": 30544738.0, + "step": 836 + }, + { + "epoch": 0.1554317548746518, + "grad_norm": 1.7085098028182983, + "learning_rate": 5.173267326732673e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.8541035652160645, + "num_tokens": 30577186.0, + "step": 837 + }, + { + "epoch": 0.15561745589600742, + "grad_norm": 1.540765643119812, + "learning_rate": 5.179455445544554e-07, + "loss": 0.5018, + "mean_token_accuracy": 0.8356806039810181, + "num_tokens": 30617625.0, + "step": 838 + }, + { + "epoch": 0.15580315691736304, + "grad_norm": 1.6730341911315918, + "learning_rate": 5.185643564356435e-07, + "loss": 0.5059, + "mean_token_accuracy": 0.8336160778999329, + "num_tokens": 30660389.0, + "step": 839 + }, + { + "epoch": 0.15598885793871867, + "grad_norm": 1.4499859809875488, + "learning_rate": 5.191831683168316e-07, + "loss": 0.4011, + "mean_token_accuracy": 0.8676704168319702, + "num_tokens": 30698046.0, + "step": 840 + }, + { + "epoch": 0.1561745589600743, + "grad_norm": 1.697485089302063, + "learning_rate": 5.198019801980198e-07, + "loss": 0.4801, + "mean_token_accuracy": 0.8433350920677185, + "num_tokens": 30732626.0, + "step": 841 + }, + { + "epoch": 0.1563602599814299, + "grad_norm": 1.601701259613037, + "learning_rate": 5.204207920792078e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.8381365537643433, + "num_tokens": 30773476.0, + "step": 842 + }, + { + "epoch": 0.1565459610027855, + "grad_norm": 1.6326549053192139, + "learning_rate": 5.210396039603961e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.848376989364624, + "num_tokens": 30811634.0, + "step": 843 + }, + { + "epoch": 0.15673166202414113, + "grad_norm": 1.5429154634475708, + "learning_rate": 5.216584158415841e-07, + "loss": 0.4602, + "mean_token_accuracy": 0.847266674041748, + "num_tokens": 30846654.0, + "step": 844 + }, + { + "epoch": 0.15691736304549675, + "grad_norm": 1.747117280960083, + "learning_rate": 5.222772277227723e-07, + "loss": 0.4575, + "mean_token_accuracy": 0.8472579717636108, + "num_tokens": 30875864.0, + "step": 845 + }, + { + "epoch": 0.15710306406685237, + "grad_norm": 1.548645257949829, + "learning_rate": 5.228960396039604e-07, + "loss": 0.4215, + "mean_token_accuracy": 0.8572754263877869, + "num_tokens": 30912365.0, + "step": 846 + }, + { + "epoch": 0.157288765088208, + "grad_norm": 1.6740041971206665, + "learning_rate": 5.235148514851485e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.8485215902328491, + "num_tokens": 30947388.0, + "step": 847 + }, + { + "epoch": 0.15747446610956362, + "grad_norm": 1.5700911283493042, + "learning_rate": 5.241336633663366e-07, + "loss": 0.4611, + "mean_token_accuracy": 0.8471469879150391, + "num_tokens": 30984941.0, + "step": 848 + }, + { + "epoch": 0.1576601671309192, + "grad_norm": 1.483809232711792, + "learning_rate": 5.247524752475247e-07, + "loss": 0.4004, + "mean_token_accuracy": 0.8640611171722412, + "num_tokens": 31024552.0, + "step": 849 + }, + { + "epoch": 0.15784586815227483, + "grad_norm": 1.6189672946929932, + "learning_rate": 5.253712871287128e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8410048484802246, + "num_tokens": 31060548.0, + "step": 850 + }, + { + "epoch": 0.15803156917363045, + "grad_norm": 1.5296826362609863, + "learning_rate": 5.25990099009901e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.8399536609649658, + "num_tokens": 31100171.0, + "step": 851 + }, + { + "epoch": 0.15821727019498608, + "grad_norm": 1.526991605758667, + "learning_rate": 5.26608910891089e-07, + "loss": 0.3595, + "mean_token_accuracy": 0.8790343999862671, + "num_tokens": 31132444.0, + "step": 852 + }, + { + "epoch": 0.1584029712163417, + "grad_norm": 1.7948460578918457, + "learning_rate": 5.272277227722772e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8366115093231201, + "num_tokens": 31162853.0, + "step": 853 + }, + { + "epoch": 0.15858867223769732, + "grad_norm": 1.793414831161499, + "learning_rate": 5.278465346534653e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8410225510597229, + "num_tokens": 31191937.0, + "step": 854 + }, + { + "epoch": 0.15877437325905291, + "grad_norm": 1.573181390762329, + "learning_rate": 5.284653465346534e-07, + "loss": 0.4867, + "mean_token_accuracy": 0.8405591249465942, + "num_tokens": 31228683.0, + "step": 855 + }, + { + "epoch": 0.15896007428040854, + "grad_norm": 1.6615502834320068, + "learning_rate": 5.290841584158416e-07, + "loss": 0.441, + "mean_token_accuracy": 0.848514199256897, + "num_tokens": 31263135.0, + "step": 856 + }, + { + "epoch": 0.15914577530176416, + "grad_norm": 1.4788811206817627, + "learning_rate": 5.297029702970297e-07, + "loss": 0.4212, + "mean_token_accuracy": 0.8571362495422363, + "num_tokens": 31301211.0, + "step": 857 + }, + { + "epoch": 0.15933147632311978, + "grad_norm": 1.622282862663269, + "learning_rate": 5.303217821782178e-07, + "loss": 0.4219, + "mean_token_accuracy": 0.8620373010635376, + "num_tokens": 31334025.0, + "step": 858 + }, + { + "epoch": 0.1595171773444754, + "grad_norm": 1.509793996810913, + "learning_rate": 5.30940594059406e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8564107418060303, + "num_tokens": 31370718.0, + "step": 859 + }, + { + "epoch": 0.15970287836583102, + "grad_norm": 1.6455167531967163, + "learning_rate": 5.31559405940594e-07, + "loss": 0.4675, + "mean_token_accuracy": 0.8471344113349915, + "num_tokens": 31404816.0, + "step": 860 + }, + { + "epoch": 0.15988857938718662, + "grad_norm": 1.5681519508361816, + "learning_rate": 5.321782178217822e-07, + "loss": 0.4992, + "mean_token_accuracy": 0.8398244976997375, + "num_tokens": 31446604.0, + "step": 861 + }, + { + "epoch": 0.16007428040854224, + "grad_norm": 1.6602518558502197, + "learning_rate": 5.327970297029702e-07, + "loss": 0.4679, + "mean_token_accuracy": 0.8429582715034485, + "num_tokens": 31483089.0, + "step": 862 + }, + { + "epoch": 0.16025998142989786, + "grad_norm": 1.625367522239685, + "learning_rate": 5.334158415841584e-07, + "loss": 0.4693, + "mean_token_accuracy": 0.8431071043014526, + "num_tokens": 31519727.0, + "step": 863 + }, + { + "epoch": 0.16044568245125349, + "grad_norm": 1.7147382497787476, + "learning_rate": 5.340346534653465e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.8332244157791138, + "num_tokens": 31556168.0, + "step": 864 + }, + { + "epoch": 0.1606313834726091, + "grad_norm": 1.7369199991226196, + "learning_rate": 5.346534653465346e-07, + "loss": 0.4652, + "mean_token_accuracy": 0.8452661037445068, + "num_tokens": 31590925.0, + "step": 865 + }, + { + "epoch": 0.16081708449396473, + "grad_norm": 1.5900683403015137, + "learning_rate": 5.352722772277227e-07, + "loss": 0.4355, + "mean_token_accuracy": 0.8554790019989014, + "num_tokens": 31627263.0, + "step": 866 + }, + { + "epoch": 0.16100278551532032, + "grad_norm": 1.5862773656845093, + "learning_rate": 5.358910891089109e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8377336263656616, + "num_tokens": 31665575.0, + "step": 867 + }, + { + "epoch": 0.16118848653667595, + "grad_norm": 1.585006833076477, + "learning_rate": 5.365099009900989e-07, + "loss": 0.4705, + "mean_token_accuracy": 0.8429903388023376, + "num_tokens": 31702913.0, + "step": 868 + }, + { + "epoch": 0.16137418755803157, + "grad_norm": 1.7441085577011108, + "learning_rate": 5.371287128712872e-07, + "loss": 0.5202, + "mean_token_accuracy": 0.8277394771575928, + "num_tokens": 31735896.0, + "step": 869 + }, + { + "epoch": 0.1615598885793872, + "grad_norm": 1.5021343231201172, + "learning_rate": 5.377475247524752e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.8444584608078003, + "num_tokens": 31775669.0, + "step": 870 + }, + { + "epoch": 0.1617455896007428, + "grad_norm": 1.5068442821502686, + "learning_rate": 5.383663366336634e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8449519872665405, + "num_tokens": 31816522.0, + "step": 871 + }, + { + "epoch": 0.16193129062209843, + "grad_norm": 1.8749653100967407, + "learning_rate": 5.389851485148515e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.8474222421646118, + "num_tokens": 31849944.0, + "step": 872 + }, + { + "epoch": 0.16211699164345403, + "grad_norm": 1.5083227157592773, + "learning_rate": 5.396039603960396e-07, + "loss": 0.4675, + "mean_token_accuracy": 0.8440806865692139, + "num_tokens": 31888459.0, + "step": 873 + }, + { + "epoch": 0.16230269266480965, + "grad_norm": 1.6660667657852173, + "learning_rate": 5.402227722772277e-07, + "loss": 0.457, + "mean_token_accuracy": 0.8478038907051086, + "num_tokens": 31924467.0, + "step": 874 + }, + { + "epoch": 0.16248839368616527, + "grad_norm": 1.635441780090332, + "learning_rate": 5.408415841584159e-07, + "loss": 0.5323, + "mean_token_accuracy": 0.8291878700256348, + "num_tokens": 31962582.0, + "step": 875 + }, + { + "epoch": 0.1626740947075209, + "grad_norm": 1.7389039993286133, + "learning_rate": 5.414603960396039e-07, + "loss": 0.5524, + "mean_token_accuracy": 0.8186454772949219, + "num_tokens": 32000811.0, + "step": 876 + }, + { + "epoch": 0.16285979572887652, + "grad_norm": 1.7209506034851074, + "learning_rate": 5.420792079207921e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.842786431312561, + "num_tokens": 32031524.0, + "step": 877 + }, + { + "epoch": 0.16304549675023214, + "grad_norm": 1.5446337461471558, + "learning_rate": 5.426980198019801e-07, + "loss": 0.4178, + "mean_token_accuracy": 0.8613494634628296, + "num_tokens": 32067321.0, + "step": 878 + }, + { + "epoch": 0.16323119777158773, + "grad_norm": 1.4843471050262451, + "learning_rate": 5.433168316831683e-07, + "loss": 0.4447, + "mean_token_accuracy": 0.852041482925415, + "num_tokens": 32111029.0, + "step": 879 + }, + { + "epoch": 0.16341689879294335, + "grad_norm": 1.7082107067108154, + "learning_rate": 5.439356435643564e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.845041036605835, + "num_tokens": 32143082.0, + "step": 880 + }, + { + "epoch": 0.16360259981429898, + "grad_norm": 1.5263217687606812, + "learning_rate": 5.445544554455445e-07, + "loss": 0.5169, + "mean_token_accuracy": 0.830458402633667, + "num_tokens": 32185176.0, + "step": 881 + }, + { + "epoch": 0.1637883008356546, + "grad_norm": 1.5918487310409546, + "learning_rate": 5.451732673267327e-07, + "loss": 0.446, + "mean_token_accuracy": 0.8507110476493835, + "num_tokens": 32220714.0, + "step": 882 + }, + { + "epoch": 0.16397400185701022, + "grad_norm": 1.6992636919021606, + "learning_rate": 5.457920792079208e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.849542498588562, + "num_tokens": 32259406.0, + "step": 883 + }, + { + "epoch": 0.16415970287836584, + "grad_norm": 1.5698693990707397, + "learning_rate": 5.464108910891089e-07, + "loss": 0.46, + "mean_token_accuracy": 0.8460401892662048, + "num_tokens": 32297068.0, + "step": 884 + }, + { + "epoch": 0.16434540389972144, + "grad_norm": 1.5181801319122314, + "learning_rate": 5.470297029702971e-07, + "loss": 0.5179, + "mean_token_accuracy": 0.8323219418525696, + "num_tokens": 32341357.0, + "step": 885 + }, + { + "epoch": 0.16453110492107706, + "grad_norm": 1.5900275707244873, + "learning_rate": 5.476485148514851e-07, + "loss": 0.456, + "mean_token_accuracy": 0.8530348539352417, + "num_tokens": 32378356.0, + "step": 886 + }, + { + "epoch": 0.16471680594243268, + "grad_norm": 1.5041037797927856, + "learning_rate": 5.482673267326733e-07, + "loss": 0.4204, + "mean_token_accuracy": 0.8585824966430664, + "num_tokens": 32418462.0, + "step": 887 + }, + { + "epoch": 0.1649025069637883, + "grad_norm": 1.6462653875350952, + "learning_rate": 5.488861386138614e-07, + "loss": 0.492, + "mean_token_accuracy": 0.8361207842826843, + "num_tokens": 32452772.0, + "step": 888 + }, + { + "epoch": 0.16508820798514393, + "grad_norm": 1.7006431818008423, + "learning_rate": 5.495049504950495e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8479852676391602, + "num_tokens": 32484995.0, + "step": 889 + }, + { + "epoch": 0.16527390900649955, + "grad_norm": 1.647732138633728, + "learning_rate": 5.501237623762376e-07, + "loss": 0.4569, + "mean_token_accuracy": 0.8515307307243347, + "num_tokens": 32520406.0, + "step": 890 + }, + { + "epoch": 0.16545961002785514, + "grad_norm": 1.840625524520874, + "learning_rate": 5.507425742574257e-07, + "loss": 0.5113, + "mean_token_accuracy": 0.8318971395492554, + "num_tokens": 32551984.0, + "step": 891 + }, + { + "epoch": 0.16564531104921076, + "grad_norm": 1.7001025676727295, + "learning_rate": 5.513613861386138e-07, + "loss": 0.4103, + "mean_token_accuracy": 0.8614272475242615, + "num_tokens": 32583385.0, + "step": 892 + }, + { + "epoch": 0.16583101207056639, + "grad_norm": 1.72334885597229, + "learning_rate": 5.51980198019802e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8367117047309875, + "num_tokens": 32615136.0, + "step": 893 + }, + { + "epoch": 0.166016713091922, + "grad_norm": 1.7407320737838745, + "learning_rate": 5.5259900990099e-07, + "loss": 0.4609, + "mean_token_accuracy": 0.8517706394195557, + "num_tokens": 32646705.0, + "step": 894 + }, + { + "epoch": 0.16620241411327763, + "grad_norm": 1.593166470527649, + "learning_rate": 5.532178217821783e-07, + "loss": 0.45, + "mean_token_accuracy": 0.8505730032920837, + "num_tokens": 32681407.0, + "step": 895 + }, + { + "epoch": 0.16638811513463325, + "grad_norm": 1.6682310104370117, + "learning_rate": 5.538366336633663e-07, + "loss": 0.4687, + "mean_token_accuracy": 0.8453264236450195, + "num_tokens": 32715638.0, + "step": 896 + }, + { + "epoch": 0.16657381615598885, + "grad_norm": 1.728474497795105, + "learning_rate": 5.544554455445545e-07, + "loss": 0.4414, + "mean_token_accuracy": 0.8540713787078857, + "num_tokens": 32747871.0, + "step": 897 + }, + { + "epoch": 0.16675951717734447, + "grad_norm": 1.51577627658844, + "learning_rate": 5.550742574257426e-07, + "loss": 0.3953, + "mean_token_accuracy": 0.8673598766326904, + "num_tokens": 32784851.0, + "step": 898 + }, + { + "epoch": 0.1669452181987001, + "grad_norm": 1.7420960664749146, + "learning_rate": 5.556930693069307e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8532273769378662, + "num_tokens": 32820819.0, + "step": 899 + }, + { + "epoch": 0.1671309192200557, + "grad_norm": 1.5119882822036743, + "learning_rate": 5.563118811881188e-07, + "loss": 0.5159, + "mean_token_accuracy": 0.8328556418418884, + "num_tokens": 32864507.0, + "step": 900 + }, + { + "epoch": 0.16731662024141133, + "grad_norm": 1.560328722000122, + "learning_rate": 5.56930693069307e-07, + "loss": 0.4679, + "mean_token_accuracy": 0.8463727831840515, + "num_tokens": 32903409.0, + "step": 901 + }, + { + "epoch": 0.16750232126276696, + "grad_norm": 1.7994881868362427, + "learning_rate": 5.57549504950495e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.8367104530334473, + "num_tokens": 32934903.0, + "step": 902 + }, + { + "epoch": 0.16768802228412255, + "grad_norm": 1.658259391784668, + "learning_rate": 5.581683168316832e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8355941772460938, + "num_tokens": 32974115.0, + "step": 903 + }, + { + "epoch": 0.16787372330547817, + "grad_norm": 1.5620145797729492, + "learning_rate": 5.587871287128712e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.8514292240142822, + "num_tokens": 33012826.0, + "step": 904 + }, + { + "epoch": 0.1680594243268338, + "grad_norm": 1.730107307434082, + "learning_rate": 5.594059405940594e-07, + "loss": 0.4503, + "mean_token_accuracy": 0.8523315191268921, + "num_tokens": 33047251.0, + "step": 905 + }, + { + "epoch": 0.16824512534818942, + "grad_norm": 1.4012742042541504, + "learning_rate": 5.600247524752475e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8390698432922363, + "num_tokens": 33090685.0, + "step": 906 + }, + { + "epoch": 0.16843082636954504, + "grad_norm": 1.6063182353973389, + "learning_rate": 5.606435643564356e-07, + "loss": 0.3815, + "mean_token_accuracy": 0.872943103313446, + "num_tokens": 33123694.0, + "step": 907 + }, + { + "epoch": 0.16861652739090066, + "grad_norm": 1.521969199180603, + "learning_rate": 5.612623762376237e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8580222129821777, + "num_tokens": 33161262.0, + "step": 908 + }, + { + "epoch": 0.16880222841225626, + "grad_norm": 1.7449108362197876, + "learning_rate": 5.61881188118812e-07, + "loss": 0.4509, + "mean_token_accuracy": 0.8497001528739929, + "num_tokens": 33190079.0, + "step": 909 + }, + { + "epoch": 0.16898792943361188, + "grad_norm": 1.6736135482788086, + "learning_rate": 5.625e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8475731611251831, + "num_tokens": 33224007.0, + "step": 910 + }, + { + "epoch": 0.1691736304549675, + "grad_norm": 1.533035159111023, + "learning_rate": 5.631188118811881e-07, + "loss": 0.453, + "mean_token_accuracy": 0.8508968353271484, + "num_tokens": 33263993.0, + "step": 911 + }, + { + "epoch": 0.16935933147632312, + "grad_norm": 1.5435274839401245, + "learning_rate": 5.637376237623762e-07, + "loss": 0.4194, + "mean_token_accuracy": 0.8552852869033813, + "num_tokens": 33303341.0, + "step": 912 + }, + { + "epoch": 0.16954503249767874, + "grad_norm": 1.499610185623169, + "learning_rate": 5.643564356435643e-07, + "loss": 0.4091, + "mean_token_accuracy": 0.8619606494903564, + "num_tokens": 33341260.0, + "step": 913 + }, + { + "epoch": 0.16973073351903437, + "grad_norm": 1.6160900592803955, + "learning_rate": 5.649752475247525e-07, + "loss": 0.4429, + "mean_token_accuracy": 0.8531498908996582, + "num_tokens": 33376070.0, + "step": 914 + }, + { + "epoch": 0.16991643454038996, + "grad_norm": 1.6403629779815674, + "learning_rate": 5.655940594059405e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.8391693830490112, + "num_tokens": 33416776.0, + "step": 915 + }, + { + "epoch": 0.17010213556174558, + "grad_norm": 1.7389230728149414, + "learning_rate": 5.662128712871287e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.834707498550415, + "num_tokens": 33453355.0, + "step": 916 + }, + { + "epoch": 0.1702878365831012, + "grad_norm": 1.7395223379135132, + "learning_rate": 5.668316831683167e-07, + "loss": 0.4753, + "mean_token_accuracy": 0.8358569741249084, + "num_tokens": 33485278.0, + "step": 917 + }, + { + "epoch": 0.17047353760445683, + "grad_norm": 1.567034125328064, + "learning_rate": 5.674504950495049e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8569706082344055, + "num_tokens": 33520789.0, + "step": 918 + }, + { + "epoch": 0.17065923862581245, + "grad_norm": 1.683243989944458, + "learning_rate": 5.68069306930693e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.8374667167663574, + "num_tokens": 33555483.0, + "step": 919 + }, + { + "epoch": 0.17084493964716807, + "grad_norm": 1.5023351907730103, + "learning_rate": 5.686881188118811e-07, + "loss": 0.4274, + "mean_token_accuracy": 0.8558803796768188, + "num_tokens": 33593820.0, + "step": 920 + }, + { + "epoch": 0.17103064066852366, + "grad_norm": 1.7058364152908325, + "learning_rate": 5.693069306930692e-07, + "loss": 0.5557, + "mean_token_accuracy": 0.8189882040023804, + "num_tokens": 33632308.0, + "step": 921 + }, + { + "epoch": 0.1712163416898793, + "grad_norm": 1.5693540573120117, + "learning_rate": 5.699257425742575e-07, + "loss": 0.4073, + "mean_token_accuracy": 0.8666269183158875, + "num_tokens": 33666423.0, + "step": 922 + }, + { + "epoch": 0.1714020427112349, + "grad_norm": 1.545644998550415, + "learning_rate": 5.705445544554455e-07, + "loss": 0.4652, + "mean_token_accuracy": 0.8465567827224731, + "num_tokens": 33705501.0, + "step": 923 + }, + { + "epoch": 0.17158774373259053, + "grad_norm": 1.5621082782745361, + "learning_rate": 5.711633663366337e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8457387089729309, + "num_tokens": 33744694.0, + "step": 924 + }, + { + "epoch": 0.17177344475394615, + "grad_norm": 1.5768988132476807, + "learning_rate": 5.717821782178217e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8444480299949646, + "num_tokens": 33782892.0, + "step": 925 + }, + { + "epoch": 0.17195914577530177, + "grad_norm": 1.5221792459487915, + "learning_rate": 5.724009900990099e-07, + "loss": 0.4101, + "mean_token_accuracy": 0.8616306781768799, + "num_tokens": 33821247.0, + "step": 926 + }, + { + "epoch": 0.17214484679665737, + "grad_norm": 1.519838571548462, + "learning_rate": 5.73019801980198e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.8381850123405457, + "num_tokens": 33861097.0, + "step": 927 + }, + { + "epoch": 0.172330547818013, + "grad_norm": 1.6808396577835083, + "learning_rate": 5.736386138613861e-07, + "loss": 0.43, + "mean_token_accuracy": 0.854449987411499, + "num_tokens": 33895552.0, + "step": 928 + }, + { + "epoch": 0.1725162488393686, + "grad_norm": 1.516728162765503, + "learning_rate": 5.742574257425742e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.8468372821807861, + "num_tokens": 33933739.0, + "step": 929 + }, + { + "epoch": 0.17270194986072424, + "grad_norm": 1.5650852918624878, + "learning_rate": 5.748762376237623e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.854073703289032, + "num_tokens": 33969679.0, + "step": 930 + }, + { + "epoch": 0.17288765088207986, + "grad_norm": 1.5124006271362305, + "learning_rate": 5.754950495049504e-07, + "loss": 0.4618, + "mean_token_accuracy": 0.8455975651741028, + "num_tokens": 34008924.0, + "step": 931 + }, + { + "epoch": 0.17307335190343548, + "grad_norm": 1.4332078695297241, + "learning_rate": 5.761138613861386e-07, + "loss": 0.4134, + "mean_token_accuracy": 0.863822340965271, + "num_tokens": 34048475.0, + "step": 932 + }, + { + "epoch": 0.17325905292479107, + "grad_norm": 1.5799415111541748, + "learning_rate": 5.767326732673266e-07, + "loss": 0.4668, + "mean_token_accuracy": 0.8455710411071777, + "num_tokens": 34085870.0, + "step": 933 + }, + { + "epoch": 0.1734447539461467, + "grad_norm": 1.640256404876709, + "learning_rate": 5.773514851485148e-07, + "loss": 0.4153, + "mean_token_accuracy": 0.8621177673339844, + "num_tokens": 34119540.0, + "step": 934 + }, + { + "epoch": 0.17363045496750232, + "grad_norm": 1.6562540531158447, + "learning_rate": 5.779702970297029e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.8430981636047363, + "num_tokens": 34154391.0, + "step": 935 + }, + { + "epoch": 0.17381615598885794, + "grad_norm": 1.4432100057601929, + "learning_rate": 5.785891089108911e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8510259985923767, + "num_tokens": 34196669.0, + "step": 936 + }, + { + "epoch": 0.17400185701021356, + "grad_norm": 1.7722922563552856, + "learning_rate": 5.792079207920792e-07, + "loss": 0.4611, + "mean_token_accuracy": 0.8459569811820984, + "num_tokens": 34229590.0, + "step": 937 + }, + { + "epoch": 0.17418755803156918, + "grad_norm": 1.7181165218353271, + "learning_rate": 5.798267326732673e-07, + "loss": 0.4276, + "mean_token_accuracy": 0.8550892472267151, + "num_tokens": 34259382.0, + "step": 938 + }, + { + "epoch": 0.17437325905292478, + "grad_norm": 1.4889718294143677, + "learning_rate": 5.804455445544554e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.8463097810745239, + "num_tokens": 34300328.0, + "step": 939 + }, + { + "epoch": 0.1745589600742804, + "grad_norm": 1.7698110342025757, + "learning_rate": 5.810643564356436e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.8379477262496948, + "num_tokens": 34334020.0, + "step": 940 + }, + { + "epoch": 0.17474466109563602, + "grad_norm": 1.8394618034362793, + "learning_rate": 5.816831683168316e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.8497229814529419, + "num_tokens": 34362371.0, + "step": 941 + }, + { + "epoch": 0.17493036211699164, + "grad_norm": 1.5448180437088013, + "learning_rate": 5.823019801980198e-07, + "loss": 0.4069, + "mean_token_accuracy": 0.8630392551422119, + "num_tokens": 34396763.0, + "step": 942 + }, + { + "epoch": 0.17511606313834727, + "grad_norm": 1.612247347831726, + "learning_rate": 5.829207920792078e-07, + "loss": 0.437, + "mean_token_accuracy": 0.8559091091156006, + "num_tokens": 34437300.0, + "step": 943 + }, + { + "epoch": 0.1753017641597029, + "grad_norm": 1.6971449851989746, + "learning_rate": 5.83539603960396e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.8425693511962891, + "num_tokens": 34470641.0, + "step": 944 + }, + { + "epoch": 0.17548746518105848, + "grad_norm": 1.84141206741333, + "learning_rate": 5.841584158415841e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8397893905639648, + "num_tokens": 34500922.0, + "step": 945 + }, + { + "epoch": 0.1756731662024141, + "grad_norm": 1.640937089920044, + "learning_rate": 5.847772277227722e-07, + "loss": 0.4548, + "mean_token_accuracy": 0.8485012054443359, + "num_tokens": 34538082.0, + "step": 946 + }, + { + "epoch": 0.17585886722376973, + "grad_norm": 1.6801180839538574, + "learning_rate": 5.853960396039603e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.845226526260376, + "num_tokens": 34573002.0, + "step": 947 + }, + { + "epoch": 0.17604456824512535, + "grad_norm": 1.4839342832565308, + "learning_rate": 5.860148514851486e-07, + "loss": 0.4609, + "mean_token_accuracy": 0.8507583737373352, + "num_tokens": 34615069.0, + "step": 948 + }, + { + "epoch": 0.17623026926648097, + "grad_norm": 1.6664276123046875, + "learning_rate": 5.866336633663366e-07, + "loss": 0.4245, + "mean_token_accuracy": 0.8555747866630554, + "num_tokens": 34648345.0, + "step": 949 + }, + { + "epoch": 0.1764159702878366, + "grad_norm": 1.6896613836288452, + "learning_rate": 5.872524752475248e-07, + "loss": 0.4492, + "mean_token_accuracy": 0.8526418209075928, + "num_tokens": 34682092.0, + "step": 950 + }, + { + "epoch": 0.1766016713091922, + "grad_norm": 1.5759130716323853, + "learning_rate": 5.878712871287128e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8391634821891785, + "num_tokens": 34722607.0, + "step": 951 + }, + { + "epoch": 0.1767873723305478, + "grad_norm": 1.474629282951355, + "learning_rate": 5.88490099009901e-07, + "loss": 0.4363, + "mean_token_accuracy": 0.854529857635498, + "num_tokens": 34760836.0, + "step": 952 + }, + { + "epoch": 0.17697307335190343, + "grad_norm": 1.3870890140533447, + "learning_rate": 5.891089108910891e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.8554726839065552, + "num_tokens": 34805897.0, + "step": 953 + }, + { + "epoch": 0.17715877437325905, + "grad_norm": 1.48003351688385, + "learning_rate": 5.897277227722772e-07, + "loss": 0.429, + "mean_token_accuracy": 0.8588732481002808, + "num_tokens": 34845721.0, + "step": 954 + }, + { + "epoch": 0.17734447539461468, + "grad_norm": 1.6679352521896362, + "learning_rate": 5.903465346534653e-07, + "loss": 0.4785, + "mean_token_accuracy": 0.8439716100692749, + "num_tokens": 34882539.0, + "step": 955 + }, + { + "epoch": 0.1775301764159703, + "grad_norm": 1.6026396751403809, + "learning_rate": 5.909653465346535e-07, + "loss": 0.4292, + "mean_token_accuracy": 0.858051061630249, + "num_tokens": 34917013.0, + "step": 956 + }, + { + "epoch": 0.1777158774373259, + "grad_norm": 1.4797148704528809, + "learning_rate": 5.915841584158415e-07, + "loss": 0.4356, + "mean_token_accuracy": 0.8571473360061646, + "num_tokens": 34955881.0, + "step": 957 + }, + { + "epoch": 0.17790157845868151, + "grad_norm": 1.6787241697311401, + "learning_rate": 5.922029702970297e-07, + "loss": 0.4541, + "mean_token_accuracy": 0.8477750420570374, + "num_tokens": 34987385.0, + "step": 958 + }, + { + "epoch": 0.17808727948003714, + "grad_norm": 1.7399308681488037, + "learning_rate": 5.928217821782177e-07, + "loss": 0.527, + "mean_token_accuracy": 0.8347465395927429, + "num_tokens": 35027257.0, + "step": 959 + }, + { + "epoch": 0.17827298050139276, + "grad_norm": 1.6914150714874268, + "learning_rate": 5.934405940594059e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.8400813937187195, + "num_tokens": 35062257.0, + "step": 960 + }, + { + "epoch": 0.17845868152274838, + "grad_norm": 1.7024157047271729, + "learning_rate": 5.94059405940594e-07, + "loss": 0.4095, + "mean_token_accuracy": 0.8598370552062988, + "num_tokens": 35095958.0, + "step": 961 + }, + { + "epoch": 0.178644382544104, + "grad_norm": 1.666154384613037, + "learning_rate": 5.946782178217822e-07, + "loss": 0.4752, + "mean_token_accuracy": 0.8399690389633179, + "num_tokens": 35132436.0, + "step": 962 + }, + { + "epoch": 0.1788300835654596, + "grad_norm": 1.5609945058822632, + "learning_rate": 5.952970297029703e-07, + "loss": 0.4511, + "mean_token_accuracy": 0.849808394908905, + "num_tokens": 35170626.0, + "step": 963 + }, + { + "epoch": 0.17901578458681522, + "grad_norm": 1.576754093170166, + "learning_rate": 5.959158415841584e-07, + "loss": 0.4562, + "mean_token_accuracy": 0.8476836681365967, + "num_tokens": 35208063.0, + "step": 964 + }, + { + "epoch": 0.17920148560817084, + "grad_norm": 1.542078971862793, + "learning_rate": 5.965346534653465e-07, + "loss": 0.4626, + "mean_token_accuracy": 0.8459516763687134, + "num_tokens": 35248867.0, + "step": 965 + }, + { + "epoch": 0.17938718662952646, + "grad_norm": 1.6643590927124023, + "learning_rate": 5.971534653465347e-07, + "loss": 0.4838, + "mean_token_accuracy": 0.8409870862960815, + "num_tokens": 35283326.0, + "step": 966 + }, + { + "epoch": 0.17957288765088208, + "grad_norm": 1.552942156791687, + "learning_rate": 5.977722772277227e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8358204364776611, + "num_tokens": 35322262.0, + "step": 967 + }, + { + "epoch": 0.1797585886722377, + "grad_norm": 1.5336072444915771, + "learning_rate": 5.983910891089109e-07, + "loss": 0.465, + "mean_token_accuracy": 0.8461126089096069, + "num_tokens": 35359560.0, + "step": 968 + }, + { + "epoch": 0.1799442896935933, + "grad_norm": 1.695713996887207, + "learning_rate": 5.99009900990099e-07, + "loss": 0.4497, + "mean_token_accuracy": 0.8488017320632935, + "num_tokens": 35390380.0, + "step": 969 + }, + { + "epoch": 0.18012999071494892, + "grad_norm": 1.755921721458435, + "learning_rate": 5.996287128712871e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.8383229970932007, + "num_tokens": 35421132.0, + "step": 970 + }, + { + "epoch": 0.18031569173630455, + "grad_norm": 1.4486185312271118, + "learning_rate": 6.002475247524752e-07, + "loss": 0.4237, + "mean_token_accuracy": 0.8591073751449585, + "num_tokens": 35463817.0, + "step": 971 + }, + { + "epoch": 0.18050139275766017, + "grad_norm": 1.567854881286621, + "learning_rate": 6.008663366336633e-07, + "loss": 0.4494, + "mean_token_accuracy": 0.8498318195343018, + "num_tokens": 35500560.0, + "step": 972 + }, + { + "epoch": 0.1806870937790158, + "grad_norm": 1.503872036933899, + "learning_rate": 6.014851485148514e-07, + "loss": 0.3996, + "mean_token_accuracy": 0.868550181388855, + "num_tokens": 35539703.0, + "step": 973 + }, + { + "epoch": 0.1808727948003714, + "grad_norm": 1.5592405796051025, + "learning_rate": 6.021039603960396e-07, + "loss": 0.4677, + "mean_token_accuracy": 0.8439364433288574, + "num_tokens": 35580797.0, + "step": 974 + }, + { + "epoch": 0.181058495821727, + "grad_norm": 1.6235295534133911, + "learning_rate": 6.027227722772277e-07, + "loss": 0.4022, + "mean_token_accuracy": 0.8656468391418457, + "num_tokens": 35611441.0, + "step": 975 + }, + { + "epoch": 0.18124419684308263, + "grad_norm": 1.5941414833068848, + "learning_rate": 6.033415841584159e-07, + "loss": 0.4467, + "mean_token_accuracy": 0.8494111895561218, + "num_tokens": 35648933.0, + "step": 976 + }, + { + "epoch": 0.18142989786443825, + "grad_norm": 1.752600073814392, + "learning_rate": 6.03960396039604e-07, + "loss": 0.5051, + "mean_token_accuracy": 0.8354077339172363, + "num_tokens": 35681272.0, + "step": 977 + }, + { + "epoch": 0.18161559888579387, + "grad_norm": 1.7367981672286987, + "learning_rate": 6.045792079207921e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8445038795471191, + "num_tokens": 35713266.0, + "step": 978 + }, + { + "epoch": 0.1818012999071495, + "grad_norm": 1.6868551969528198, + "learning_rate": 6.051980198019802e-07, + "loss": 0.4637, + "mean_token_accuracy": 0.8522468209266663, + "num_tokens": 35744938.0, + "step": 979 + }, + { + "epoch": 0.18198700092850512, + "grad_norm": 1.6308395862579346, + "learning_rate": 6.058168316831683e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.8396337628364563, + "num_tokens": 35781998.0, + "step": 980 + }, + { + "epoch": 0.18217270194986074, + "grad_norm": 1.514255166053772, + "learning_rate": 6.064356435643564e-07, + "loss": 0.458, + "mean_token_accuracy": 0.8511248230934143, + "num_tokens": 35821030.0, + "step": 981 + }, + { + "epoch": 0.18235840297121633, + "grad_norm": 1.5032085180282593, + "learning_rate": 6.070544554455446e-07, + "loss": 0.4388, + "mean_token_accuracy": 0.856899619102478, + "num_tokens": 35860827.0, + "step": 982 + }, + { + "epoch": 0.18254410399257195, + "grad_norm": 1.4880205392837524, + "learning_rate": 6.076732673267326e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8557377457618713, + "num_tokens": 35898746.0, + "step": 983 + }, + { + "epoch": 0.18272980501392758, + "grad_norm": 1.5895519256591797, + "learning_rate": 6.082920792079208e-07, + "loss": 0.4509, + "mean_token_accuracy": 0.8487152457237244, + "num_tokens": 35932808.0, + "step": 984 + }, + { + "epoch": 0.1829155060352832, + "grad_norm": 1.6456941366195679, + "learning_rate": 6.089108910891088e-07, + "loss": 0.4005, + "mean_token_accuracy": 0.8675546646118164, + "num_tokens": 35967657.0, + "step": 985 + }, + { + "epoch": 0.18310120705663882, + "grad_norm": 1.5325769186019897, + "learning_rate": 6.09529702970297e-07, + "loss": 0.4476, + "mean_token_accuracy": 0.8535180687904358, + "num_tokens": 36006635.0, + "step": 986 + }, + { + "epoch": 0.18328690807799444, + "grad_norm": 1.5549039840698242, + "learning_rate": 6.101485148514851e-07, + "loss": 0.4373, + "mean_token_accuracy": 0.8529176712036133, + "num_tokens": 36047235.0, + "step": 987 + }, + { + "epoch": 0.18347260909935004, + "grad_norm": 1.802778959274292, + "learning_rate": 6.107673267326733e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8371241092681885, + "num_tokens": 36077784.0, + "step": 988 + }, + { + "epoch": 0.18365831012070566, + "grad_norm": 1.5702226161956787, + "learning_rate": 6.113861386138614e-07, + "loss": 0.4704, + "mean_token_accuracy": 0.8459147214889526, + "num_tokens": 36114798.0, + "step": 989 + }, + { + "epoch": 0.18384401114206128, + "grad_norm": 1.6340985298156738, + "learning_rate": 6.120049504950496e-07, + "loss": 0.4255, + "mean_token_accuracy": 0.8525382280349731, + "num_tokens": 36148011.0, + "step": 990 + }, + { + "epoch": 0.1840297121634169, + "grad_norm": 1.463714361190796, + "learning_rate": 6.126237623762376e-07, + "loss": 0.4377, + "mean_token_accuracy": 0.8532806634902954, + "num_tokens": 36187775.0, + "step": 991 + }, + { + "epoch": 0.18421541318477253, + "grad_norm": 1.6529557704925537, + "learning_rate": 6.132425742574258e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.8455798029899597, + "num_tokens": 36221271.0, + "step": 992 + }, + { + "epoch": 0.18440111420612815, + "grad_norm": 1.549250841140747, + "learning_rate": 6.138613861386138e-07, + "loss": 0.4746, + "mean_token_accuracy": 0.8416074514389038, + "num_tokens": 36261006.0, + "step": 993 + }, + { + "epoch": 0.18458681522748374, + "grad_norm": 1.5663448572158813, + "learning_rate": 6.14480198019802e-07, + "loss": 0.4411, + "mean_token_accuracy": 0.8558100461959839, + "num_tokens": 36297868.0, + "step": 994 + }, + { + "epoch": 0.18477251624883936, + "grad_norm": 1.6363170146942139, + "learning_rate": 6.150990099009901e-07, + "loss": 0.454, + "mean_token_accuracy": 0.8506168127059937, + "num_tokens": 36333328.0, + "step": 995 + }, + { + "epoch": 0.18495821727019499, + "grad_norm": 1.5541541576385498, + "learning_rate": 6.157178217821782e-07, + "loss": 0.4297, + "mean_token_accuracy": 0.8564860224723816, + "num_tokens": 36368122.0, + "step": 996 + }, + { + "epoch": 0.1851439182915506, + "grad_norm": 1.5003691911697388, + "learning_rate": 6.163366336633663e-07, + "loss": 0.4537, + "mean_token_accuracy": 0.8520895838737488, + "num_tokens": 36405961.0, + "step": 997 + }, + { + "epoch": 0.18532961931290623, + "grad_norm": 1.567535161972046, + "learning_rate": 6.169554455445544e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8445079326629639, + "num_tokens": 36441242.0, + "step": 998 + }, + { + "epoch": 0.18551532033426185, + "grad_norm": 1.3495250940322876, + "learning_rate": 6.175742574257425e-07, + "loss": 0.4378, + "mean_token_accuracy": 0.8561211824417114, + "num_tokens": 36486051.0, + "step": 999 + }, + { + "epoch": 0.18570102135561745, + "grad_norm": 1.7766878604888916, + "learning_rate": 6.181930693069307e-07, + "loss": 0.4673, + "mean_token_accuracy": 0.8476780652999878, + "num_tokens": 36516904.0, + "step": 1000 + }, + { + "epoch": 0.18588672237697307, + "grad_norm": 1.5174989700317383, + "learning_rate": 6.188118811881187e-07, + "loss": 0.4554, + "mean_token_accuracy": 0.8467482328414917, + "num_tokens": 36555861.0, + "step": 1001 + }, + { + "epoch": 0.1860724233983287, + "grad_norm": 1.5733485221862793, + "learning_rate": 6.19430693069307e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.8377522230148315, + "num_tokens": 36593271.0, + "step": 1002 + }, + { + "epoch": 0.1862581244196843, + "grad_norm": 1.7608661651611328, + "learning_rate": 6.200495049504951e-07, + "loss": 0.4435, + "mean_token_accuracy": 0.8526749014854431, + "num_tokens": 36623918.0, + "step": 1003 + }, + { + "epoch": 0.18644382544103993, + "grad_norm": 1.568830966949463, + "learning_rate": 6.206683168316832e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8490147590637207, + "num_tokens": 36661941.0, + "step": 1004 + }, + { + "epoch": 0.18662952646239556, + "grad_norm": 1.5882971286773682, + "learning_rate": 6.212871287128713e-07, + "loss": 0.4275, + "mean_token_accuracy": 0.8546783924102783, + "num_tokens": 36695836.0, + "step": 1005 + }, + { + "epoch": 0.18681522748375115, + "grad_norm": 1.6062170267105103, + "learning_rate": 6.219059405940594e-07, + "loss": 0.4141, + "mean_token_accuracy": 0.8603285551071167, + "num_tokens": 36730470.0, + "step": 1006 + }, + { + "epoch": 0.18700092850510677, + "grad_norm": 1.6674041748046875, + "learning_rate": 6.225247524752475e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8414156436920166, + "num_tokens": 36768461.0, + "step": 1007 + }, + { + "epoch": 0.1871866295264624, + "grad_norm": 1.4419097900390625, + "learning_rate": 6.231435643564357e-07, + "loss": 0.4187, + "mean_token_accuracy": 0.8615774512290955, + "num_tokens": 36808217.0, + "step": 1008 + }, + { + "epoch": 0.18737233054781802, + "grad_norm": 1.685997724533081, + "learning_rate": 6.237623762376237e-07, + "loss": 0.4603, + "mean_token_accuracy": 0.8456472158432007, + "num_tokens": 36841857.0, + "step": 1009 + }, + { + "epoch": 0.18755803156917364, + "grad_norm": 1.703232765197754, + "learning_rate": 6.243811881188119e-07, + "loss": 0.4441, + "mean_token_accuracy": 0.8554261326789856, + "num_tokens": 36877178.0, + "step": 1010 + }, + { + "epoch": 0.18774373259052926, + "grad_norm": 1.5847837924957275, + "learning_rate": 6.249999999999999e-07, + "loss": 0.4704, + "mean_token_accuracy": 0.8445423245429993, + "num_tokens": 36916843.0, + "step": 1011 + }, + { + "epoch": 0.18792943361188486, + "grad_norm": 1.7337775230407715, + "learning_rate": 6.25618811881188e-07, + "loss": 0.5175, + "mean_token_accuracy": 0.8270413279533386, + "num_tokens": 36951932.0, + "step": 1012 + }, + { + "epoch": 0.18811513463324048, + "grad_norm": 1.6841932535171509, + "learning_rate": 6.262376237623762e-07, + "loss": 0.4205, + "mean_token_accuracy": 0.8600308299064636, + "num_tokens": 36988884.0, + "step": 1013 + }, + { + "epoch": 0.1883008356545961, + "grad_norm": 1.649070143699646, + "learning_rate": 6.268564356435642e-07, + "loss": 0.4426, + "mean_token_accuracy": 0.8509535789489746, + "num_tokens": 37023301.0, + "step": 1014 + }, + { + "epoch": 0.18848653667595172, + "grad_norm": 1.5348325967788696, + "learning_rate": 6.274752475247525e-07, + "loss": 0.478, + "mean_token_accuracy": 0.8433470726013184, + "num_tokens": 37061609.0, + "step": 1015 + }, + { + "epoch": 0.18867223769730734, + "grad_norm": 1.6125128269195557, + "learning_rate": 6.280940594059406e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8394169211387634, + "num_tokens": 37098611.0, + "step": 1016 + }, + { + "epoch": 0.18885793871866297, + "grad_norm": 1.7742549180984497, + "learning_rate": 6.287128712871287e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8408404588699341, + "num_tokens": 37134130.0, + "step": 1017 + }, + { + "epoch": 0.18904363974001856, + "grad_norm": 1.5654146671295166, + "learning_rate": 6.293316831683168e-07, + "loss": 0.464, + "mean_token_accuracy": 0.8445194959640503, + "num_tokens": 37174384.0, + "step": 1018 + }, + { + "epoch": 0.18922934076137418, + "grad_norm": 1.5437926054000854, + "learning_rate": 6.299504950495049e-07, + "loss": 0.3729, + "mean_token_accuracy": 0.8723808526992798, + "num_tokens": 37209778.0, + "step": 1019 + }, + { + "epoch": 0.1894150417827298, + "grad_norm": 1.7374638319015503, + "learning_rate": 6.30569306930693e-07, + "loss": 0.432, + "mean_token_accuracy": 0.8563315272331238, + "num_tokens": 37242088.0, + "step": 1020 + }, + { + "epoch": 0.18960074280408543, + "grad_norm": 1.5550373792648315, + "learning_rate": 6.311881188118812e-07, + "loss": 0.4164, + "mean_token_accuracy": 0.8618055582046509, + "num_tokens": 37280432.0, + "step": 1021 + }, + { + "epoch": 0.18978644382544105, + "grad_norm": 1.8649381399154663, + "learning_rate": 6.318069306930692e-07, + "loss": 0.4677, + "mean_token_accuracy": 0.8432972431182861, + "num_tokens": 37309122.0, + "step": 1022 + }, + { + "epoch": 0.18997214484679667, + "grad_norm": 1.7063028812408447, + "learning_rate": 6.324257425742574e-07, + "loss": 0.4697, + "mean_token_accuracy": 0.8447306752204895, + "num_tokens": 37341845.0, + "step": 1023 + }, + { + "epoch": 0.19015784586815226, + "grad_norm": 1.577916145324707, + "learning_rate": 6.330445544554454e-07, + "loss": 0.4428, + "mean_token_accuracy": 0.8528627157211304, + "num_tokens": 37380809.0, + "step": 1024 + }, + { + "epoch": 0.1903435468895079, + "grad_norm": 1.6234052181243896, + "learning_rate": 6.336633663366336e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8408846855163574, + "num_tokens": 37418899.0, + "step": 1025 + }, + { + "epoch": 0.1905292479108635, + "grad_norm": 1.6360273361206055, + "learning_rate": 6.342821782178217e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.8441818952560425, + "num_tokens": 37455316.0, + "step": 1026 + }, + { + "epoch": 0.19071494893221913, + "grad_norm": 1.533486008644104, + "learning_rate": 6.349009900990098e-07, + "loss": 0.4185, + "mean_token_accuracy": 0.863452672958374, + "num_tokens": 37493845.0, + "step": 1027 + }, + { + "epoch": 0.19090064995357475, + "grad_norm": 1.6163829565048218, + "learning_rate": 6.35519801980198e-07, + "loss": 0.4511, + "mean_token_accuracy": 0.850975751876831, + "num_tokens": 37529107.0, + "step": 1028 + }, + { + "epoch": 0.19108635097493037, + "grad_norm": 1.5597516298294067, + "learning_rate": 6.361386138613862e-07, + "loss": 0.4353, + "mean_token_accuracy": 0.8544365167617798, + "num_tokens": 37565438.0, + "step": 1029 + }, + { + "epoch": 0.19127205199628597, + "grad_norm": 1.5789943933486938, + "learning_rate": 6.367574257425742e-07, + "loss": 0.4226, + "mean_token_accuracy": 0.8599667549133301, + "num_tokens": 37604945.0, + "step": 1030 + }, + { + "epoch": 0.1914577530176416, + "grad_norm": 1.6891034841537476, + "learning_rate": 6.373762376237624e-07, + "loss": 0.4453, + "mean_token_accuracy": 0.8497058153152466, + "num_tokens": 37639026.0, + "step": 1031 + }, + { + "epoch": 0.1916434540389972, + "grad_norm": 1.4989142417907715, + "learning_rate": 6.379950495049504e-07, + "loss": 0.4361, + "mean_token_accuracy": 0.8512151837348938, + "num_tokens": 37678670.0, + "step": 1032 + }, + { + "epoch": 0.19182915506035284, + "grad_norm": 1.5730907917022705, + "learning_rate": 6.386138613861386e-07, + "loss": 0.4608, + "mean_token_accuracy": 0.849225640296936, + "num_tokens": 37717037.0, + "step": 1033 + }, + { + "epoch": 0.19201485608170846, + "grad_norm": 1.5673516988754272, + "learning_rate": 6.392326732673267e-07, + "loss": 0.4769, + "mean_token_accuracy": 0.8429852724075317, + "num_tokens": 37753990.0, + "step": 1034 + }, + { + "epoch": 0.19220055710306408, + "grad_norm": 1.5553169250488281, + "learning_rate": 6.398514851485148e-07, + "loss": 0.421, + "mean_token_accuracy": 0.8606817126274109, + "num_tokens": 37792363.0, + "step": 1035 + }, + { + "epoch": 0.19238625812441967, + "grad_norm": 1.583483099937439, + "learning_rate": 6.404702970297029e-07, + "loss": 0.43, + "mean_token_accuracy": 0.8537503480911255, + "num_tokens": 37827497.0, + "step": 1036 + }, + { + "epoch": 0.1925719591457753, + "grad_norm": 1.578570008277893, + "learning_rate": 6.41089108910891e-07, + "loss": 0.4471, + "mean_token_accuracy": 0.8505498170852661, + "num_tokens": 37863765.0, + "step": 1037 + }, + { + "epoch": 0.19275766016713092, + "grad_norm": 1.5355257987976074, + "learning_rate": 6.417079207920791e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8485370874404907, + "num_tokens": 37900943.0, + "step": 1038 + }, + { + "epoch": 0.19294336118848654, + "grad_norm": 1.6547441482543945, + "learning_rate": 6.423267326732673e-07, + "loss": 0.4552, + "mean_token_accuracy": 0.849339485168457, + "num_tokens": 37937150.0, + "step": 1039 + }, + { + "epoch": 0.19312906220984216, + "grad_norm": 1.7000854015350342, + "learning_rate": 6.429455445544553e-07, + "loss": 0.4105, + "mean_token_accuracy": 0.8627597093582153, + "num_tokens": 37971284.0, + "step": 1040 + }, + { + "epoch": 0.19331476323119778, + "grad_norm": 1.6052746772766113, + "learning_rate": 6.435643564356436e-07, + "loss": 0.4621, + "mean_token_accuracy": 0.8449893593788147, + "num_tokens": 38011022.0, + "step": 1041 + }, + { + "epoch": 0.19350046425255338, + "grad_norm": 1.7249963283538818, + "learning_rate": 6.441831683168317e-07, + "loss": 0.3828, + "mean_token_accuracy": 0.8706254959106445, + "num_tokens": 38042847.0, + "step": 1042 + }, + { + "epoch": 0.193686165273909, + "grad_norm": 1.6313555240631104, + "learning_rate": 6.448019801980198e-07, + "loss": 0.4111, + "mean_token_accuracy": 0.8644310832023621, + "num_tokens": 38073494.0, + "step": 1043 + }, + { + "epoch": 0.19387186629526462, + "grad_norm": 1.51510751247406, + "learning_rate": 6.454207920792079e-07, + "loss": 0.4395, + "mean_token_accuracy": 0.8529525399208069, + "num_tokens": 38112263.0, + "step": 1044 + }, + { + "epoch": 0.19405756731662024, + "grad_norm": 1.5230090618133545, + "learning_rate": 6.46039603960396e-07, + "loss": 0.4482, + "mean_token_accuracy": 0.8485110998153687, + "num_tokens": 38149041.0, + "step": 1045 + }, + { + "epoch": 0.19424326833797587, + "grad_norm": 1.568281888961792, + "learning_rate": 6.466584158415841e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.8458449244499207, + "num_tokens": 38186220.0, + "step": 1046 + }, + { + "epoch": 0.1944289693593315, + "grad_norm": 1.6159579753875732, + "learning_rate": 6.472772277227723e-07, + "loss": 0.4553, + "mean_token_accuracy": 0.8491816520690918, + "num_tokens": 38226697.0, + "step": 1047 + }, + { + "epoch": 0.19461467038068708, + "grad_norm": 1.5066101551055908, + "learning_rate": 6.478960396039603e-07, + "loss": 0.4992, + "mean_token_accuracy": 0.8369531035423279, + "num_tokens": 38269213.0, + "step": 1048 + }, + { + "epoch": 0.1948003714020427, + "grad_norm": 1.6488622426986694, + "learning_rate": 6.485148514851485e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.8408534526824951, + "num_tokens": 38303886.0, + "step": 1049 + }, + { + "epoch": 0.19498607242339833, + "grad_norm": 1.4796340465545654, + "learning_rate": 6.491336633663366e-07, + "loss": 0.3847, + "mean_token_accuracy": 0.8708547353744507, + "num_tokens": 38339149.0, + "step": 1050 + }, + { + "epoch": 0.19517177344475395, + "grad_norm": 1.611321210861206, + "learning_rate": 6.497524752475247e-07, + "loss": 0.41, + "mean_token_accuracy": 0.8614877462387085, + "num_tokens": 38373789.0, + "step": 1051 + }, + { + "epoch": 0.19535747446610957, + "grad_norm": 1.4973583221435547, + "learning_rate": 6.503712871287128e-07, + "loss": 0.4578, + "mean_token_accuracy": 0.8493291139602661, + "num_tokens": 38416056.0, + "step": 1052 + }, + { + "epoch": 0.1955431754874652, + "grad_norm": 1.8295314311981201, + "learning_rate": 6.509900990099009e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8417824506759644, + "num_tokens": 38447702.0, + "step": 1053 + }, + { + "epoch": 0.1957288765088208, + "grad_norm": 1.7006174325942993, + "learning_rate": 6.51608910891089e-07, + "loss": 0.4944, + "mean_token_accuracy": 0.836806058883667, + "num_tokens": 38485330.0, + "step": 1054 + }, + { + "epoch": 0.1959145775301764, + "grad_norm": 1.5153874158859253, + "learning_rate": 6.522277227722773e-07, + "loss": 0.4307, + "mean_token_accuracy": 0.8577568531036377, + "num_tokens": 38524864.0, + "step": 1055 + }, + { + "epoch": 0.19610027855153203, + "grad_norm": 1.5340391397476196, + "learning_rate": 6.528465346534653e-07, + "loss": 0.4388, + "mean_token_accuracy": 0.8548824191093445, + "num_tokens": 38565080.0, + "step": 1056 + }, + { + "epoch": 0.19628597957288765, + "grad_norm": 1.6395245790481567, + "learning_rate": 6.534653465346535e-07, + "loss": 0.4451, + "mean_token_accuracy": 0.8503443002700806, + "num_tokens": 38601767.0, + "step": 1057 + }, + { + "epoch": 0.19647168059424328, + "grad_norm": 1.7074228525161743, + "learning_rate": 6.540841584158415e-07, + "loss": 0.4653, + "mean_token_accuracy": 0.8480924367904663, + "num_tokens": 38634773.0, + "step": 1058 + }, + { + "epoch": 0.1966573816155989, + "grad_norm": 1.5509002208709717, + "learning_rate": 6.547029702970297e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.8490513563156128, + "num_tokens": 38672311.0, + "step": 1059 + }, + { + "epoch": 0.1968430826369545, + "grad_norm": 1.4811805486679077, + "learning_rate": 6.553217821782178e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8547846078872681, + "num_tokens": 38714321.0, + "step": 1060 + }, + { + "epoch": 0.1970287836583101, + "grad_norm": 1.682066559791565, + "learning_rate": 6.559405940594059e-07, + "loss": 0.4293, + "mean_token_accuracy": 0.8582332134246826, + "num_tokens": 38746656.0, + "step": 1061 + }, + { + "epoch": 0.19721448467966574, + "grad_norm": 1.648508906364441, + "learning_rate": 6.56559405940594e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.8466317653656006, + "num_tokens": 38782870.0, + "step": 1062 + }, + { + "epoch": 0.19740018570102136, + "grad_norm": 1.6756004095077515, + "learning_rate": 6.571782178217822e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8415970802307129, + "num_tokens": 38817429.0, + "step": 1063 + }, + { + "epoch": 0.19758588672237698, + "grad_norm": 1.7549636363983154, + "learning_rate": 6.577970297029702e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8487007021903992, + "num_tokens": 38849027.0, + "step": 1064 + }, + { + "epoch": 0.1977715877437326, + "grad_norm": 1.6728897094726562, + "learning_rate": 6.584158415841584e-07, + "loss": 0.4439, + "mean_token_accuracy": 0.852776050567627, + "num_tokens": 38881586.0, + "step": 1065 + }, + { + "epoch": 0.1979572887650882, + "grad_norm": 1.5708699226379395, + "learning_rate": 6.590346534653464e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.854221761226654, + "num_tokens": 38920601.0, + "step": 1066 + }, + { + "epoch": 0.19814298978644382, + "grad_norm": 1.8577874898910522, + "learning_rate": 6.596534653465346e-07, + "loss": 0.4969, + "mean_token_accuracy": 0.835719108581543, + "num_tokens": 38955744.0, + "step": 1067 + }, + { + "epoch": 0.19832869080779944, + "grad_norm": 1.5484918355941772, + "learning_rate": 6.602722772277228e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8516493439674377, + "num_tokens": 38992714.0, + "step": 1068 + }, + { + "epoch": 0.19851439182915506, + "grad_norm": 1.8827731609344482, + "learning_rate": 6.608910891089109e-07, + "loss": 0.488, + "mean_token_accuracy": 0.842646062374115, + "num_tokens": 39022432.0, + "step": 1069 + }, + { + "epoch": 0.19870009285051068, + "grad_norm": 1.4943134784698486, + "learning_rate": 6.61509900990099e-07, + "loss": 0.4745, + "mean_token_accuracy": 0.8414257764816284, + "num_tokens": 39063536.0, + "step": 1070 + }, + { + "epoch": 0.1988857938718663, + "grad_norm": 1.6386759281158447, + "learning_rate": 6.621287128712872e-07, + "loss": 0.4155, + "mean_token_accuracy": 0.8626868724822998, + "num_tokens": 39099431.0, + "step": 1071 + }, + { + "epoch": 0.1990714948932219, + "grad_norm": 1.670763611793518, + "learning_rate": 6.627475247524752e-07, + "loss": 0.4721, + "mean_token_accuracy": 0.8457156419754028, + "num_tokens": 39135341.0, + "step": 1072 + }, + { + "epoch": 0.19925719591457752, + "grad_norm": 1.5421013832092285, + "learning_rate": 6.633663366336634e-07, + "loss": 0.4763, + "mean_token_accuracy": 0.844902515411377, + "num_tokens": 39175874.0, + "step": 1073 + }, + { + "epoch": 0.19944289693593314, + "grad_norm": 1.4195808172225952, + "learning_rate": 6.639851485148514e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.853884756565094, + "num_tokens": 39221652.0, + "step": 1074 + }, + { + "epoch": 0.19962859795728877, + "grad_norm": 1.5359517335891724, + "learning_rate": 6.646039603960396e-07, + "loss": 0.3935, + "mean_token_accuracy": 0.867682695388794, + "num_tokens": 39258026.0, + "step": 1075 + }, + { + "epoch": 0.1998142989786444, + "grad_norm": 1.4542032480239868, + "learning_rate": 6.652227722772277e-07, + "loss": 0.4163, + "mean_token_accuracy": 0.8612346053123474, + "num_tokens": 39300248.0, + "step": 1076 + }, + { + "epoch": 0.2, + "grad_norm": 1.5730195045471191, + "learning_rate": 6.658415841584158e-07, + "loss": 0.4854, + "mean_token_accuracy": 0.8433421850204468, + "num_tokens": 39337596.0, + "step": 1077 + }, + { + "epoch": 0.2001857010213556, + "grad_norm": 1.6562104225158691, + "learning_rate": 6.664603960396039e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.8419151306152344, + "num_tokens": 39374393.0, + "step": 1078 + }, + { + "epoch": 0.20037140204271123, + "grad_norm": 1.6888447999954224, + "learning_rate": 6.67079207920792e-07, + "loss": 0.4643, + "mean_token_accuracy": 0.8447871804237366, + "num_tokens": 39409535.0, + "step": 1079 + }, + { + "epoch": 0.20055710306406685, + "grad_norm": 1.655669927597046, + "learning_rate": 6.676980198019801e-07, + "loss": 0.4222, + "mean_token_accuracy": 0.8568161725997925, + "num_tokens": 39443114.0, + "step": 1080 + }, + { + "epoch": 0.20074280408542247, + "grad_norm": 1.6539496183395386, + "learning_rate": 6.683168316831684e-07, + "loss": 0.4836, + "mean_token_accuracy": 0.839064359664917, + "num_tokens": 39477161.0, + "step": 1081 + }, + { + "epoch": 0.2009285051067781, + "grad_norm": 1.6188786029815674, + "learning_rate": 6.689356435643564e-07, + "loss": 0.4639, + "mean_token_accuracy": 0.8459205627441406, + "num_tokens": 39512809.0, + "step": 1082 + }, + { + "epoch": 0.20111420612813372, + "grad_norm": 1.604262113571167, + "learning_rate": 6.695544554455446e-07, + "loss": 0.4378, + "mean_token_accuracy": 0.8525856733322144, + "num_tokens": 39549039.0, + "step": 1083 + }, + { + "epoch": 0.2012999071494893, + "grad_norm": 1.6899443864822388, + "learning_rate": 6.701732673267327e-07, + "loss": 0.4815, + "mean_token_accuracy": 0.8447844982147217, + "num_tokens": 39584101.0, + "step": 1084 + }, + { + "epoch": 0.20148560817084493, + "grad_norm": 1.5558816194534302, + "learning_rate": 6.707920792079208e-07, + "loss": 0.4314, + "mean_token_accuracy": 0.8555775284767151, + "num_tokens": 39623408.0, + "step": 1085 + }, + { + "epoch": 0.20167130919220055, + "grad_norm": 1.7476452589035034, + "learning_rate": 6.714108910891089e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.8375862836837769, + "num_tokens": 39653825.0, + "step": 1086 + }, + { + "epoch": 0.20185701021355618, + "grad_norm": 1.6777524948120117, + "learning_rate": 6.72029702970297e-07, + "loss": 0.411, + "mean_token_accuracy": 0.8601045608520508, + "num_tokens": 39686915.0, + "step": 1087 + }, + { + "epoch": 0.2020427112349118, + "grad_norm": 1.6628875732421875, + "learning_rate": 6.726485148514851e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8433610200881958, + "num_tokens": 39724415.0, + "step": 1088 + }, + { + "epoch": 0.20222841225626742, + "grad_norm": 1.5587161779403687, + "learning_rate": 6.732673267326733e-07, + "loss": 0.4577, + "mean_token_accuracy": 0.8484734296798706, + "num_tokens": 39760520.0, + "step": 1089 + }, + { + "epoch": 0.20241411327762301, + "grad_norm": 1.509706735610962, + "learning_rate": 6.738861386138613e-07, + "loss": 0.4127, + "mean_token_accuracy": 0.8607376217842102, + "num_tokens": 39796654.0, + "step": 1090 + }, + { + "epoch": 0.20259981429897864, + "grad_norm": 1.5894336700439453, + "learning_rate": 6.745049504950495e-07, + "loss": 0.4459, + "mean_token_accuracy": 0.8529947996139526, + "num_tokens": 39833240.0, + "step": 1091 + }, + { + "epoch": 0.20278551532033426, + "grad_norm": 1.6240277290344238, + "learning_rate": 6.751237623762375e-07, + "loss": 0.5037, + "mean_token_accuracy": 0.8356932401657104, + "num_tokens": 39869754.0, + "step": 1092 + }, + { + "epoch": 0.20297121634168988, + "grad_norm": 1.5739657878875732, + "learning_rate": 6.757425742574257e-07, + "loss": 0.4048, + "mean_token_accuracy": 0.8648951053619385, + "num_tokens": 39901320.0, + "step": 1093 + }, + { + "epoch": 0.2031569173630455, + "grad_norm": 1.7271534204483032, + "learning_rate": 6.763613861386139e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.8408010601997375, + "num_tokens": 39934555.0, + "step": 1094 + }, + { + "epoch": 0.20334261838440112, + "grad_norm": 1.606431007385254, + "learning_rate": 6.76980198019802e-07, + "loss": 0.4568, + "mean_token_accuracy": 0.8459320068359375, + "num_tokens": 39970096.0, + "step": 1095 + }, + { + "epoch": 0.20352831940575672, + "grad_norm": 1.4811853170394897, + "learning_rate": 6.775990099009901e-07, + "loss": 0.4556, + "mean_token_accuracy": 0.8496288657188416, + "num_tokens": 40010950.0, + "step": 1096 + }, + { + "epoch": 0.20371402042711234, + "grad_norm": 1.5850340127944946, + "learning_rate": 6.782178217821783e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8505072593688965, + "num_tokens": 40044953.0, + "step": 1097 + }, + { + "epoch": 0.20389972144846796, + "grad_norm": 1.6493194103240967, + "learning_rate": 6.788366336633663e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8497067093849182, + "num_tokens": 40080042.0, + "step": 1098 + }, + { + "epoch": 0.20408542246982359, + "grad_norm": 1.5271116495132446, + "learning_rate": 6.794554455445545e-07, + "loss": 0.4374, + "mean_token_accuracy": 0.853005588054657, + "num_tokens": 40116326.0, + "step": 1099 + }, + { + "epoch": 0.2042711234911792, + "grad_norm": 1.6632980108261108, + "learning_rate": 6.800742574257425e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.8415201306343079, + "num_tokens": 40151576.0, + "step": 1100 + }, + { + "epoch": 0.20445682451253483, + "grad_norm": 1.7400171756744385, + "learning_rate": 6.806930693069307e-07, + "loss": 0.4465, + "mean_token_accuracy": 0.8506617546081543, + "num_tokens": 40188178.0, + "step": 1101 + }, + { + "epoch": 0.20464252553389042, + "grad_norm": 1.3842891454696655, + "learning_rate": 6.813118811881188e-07, + "loss": 0.4411, + "mean_token_accuracy": 0.85271155834198, + "num_tokens": 40230615.0, + "step": 1102 + }, + { + "epoch": 0.20482822655524605, + "grad_norm": 1.6872882843017578, + "learning_rate": 6.819306930693069e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.8390445113182068, + "num_tokens": 40265255.0, + "step": 1103 + }, + { + "epoch": 0.20501392757660167, + "grad_norm": 1.6279590129852295, + "learning_rate": 6.82549504950495e-07, + "loss": 0.4604, + "mean_token_accuracy": 0.8478585481643677, + "num_tokens": 40298951.0, + "step": 1104 + }, + { + "epoch": 0.2051996285979573, + "grad_norm": 1.5929555892944336, + "learning_rate": 6.831683168316831e-07, + "loss": 0.4249, + "mean_token_accuracy": 0.8567818403244019, + "num_tokens": 40337146.0, + "step": 1105 + }, + { + "epoch": 0.2053853296193129, + "grad_norm": 1.8148475885391235, + "learning_rate": 6.837871287128712e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8583570122718811, + "num_tokens": 40370271.0, + "step": 1106 + }, + { + "epoch": 0.20557103064066853, + "grad_norm": 1.6097173690795898, + "learning_rate": 6.844059405940595e-07, + "loss": 0.5236, + "mean_token_accuracy": 0.8355687856674194, + "num_tokens": 40408549.0, + "step": 1107 + }, + { + "epoch": 0.20575673166202413, + "grad_norm": 1.5856618881225586, + "learning_rate": 6.850247524752475e-07, + "loss": 0.4201, + "mean_token_accuracy": 0.8559771180152893, + "num_tokens": 40443827.0, + "step": 1108 + }, + { + "epoch": 0.20594243268337975, + "grad_norm": 1.4706069231033325, + "learning_rate": 6.856435643564357e-07, + "loss": 0.3892, + "mean_token_accuracy": 0.8660436868667603, + "num_tokens": 40480181.0, + "step": 1109 + }, + { + "epoch": 0.20612813370473537, + "grad_norm": 1.7757692337036133, + "learning_rate": 6.862623762376238e-07, + "loss": 0.4753, + "mean_token_accuracy": 0.8429962396621704, + "num_tokens": 40515756.0, + "step": 1110 + }, + { + "epoch": 0.206313834726091, + "grad_norm": 1.4604507684707642, + "learning_rate": 6.868811881188119e-07, + "loss": 0.3953, + "mean_token_accuracy": 0.8678210377693176, + "num_tokens": 40554528.0, + "step": 1111 + }, + { + "epoch": 0.20649953574744662, + "grad_norm": 1.6683995723724365, + "learning_rate": 6.875e-07, + "loss": 0.4614, + "mean_token_accuracy": 0.8446931838989258, + "num_tokens": 40587871.0, + "step": 1112 + }, + { + "epoch": 0.20668523676880224, + "grad_norm": 1.509340524673462, + "learning_rate": 6.88118811881188e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8405922651290894, + "num_tokens": 40631318.0, + "step": 1113 + }, + { + "epoch": 0.20687093779015783, + "grad_norm": 1.5644243955612183, + "learning_rate": 6.887376237623762e-07, + "loss": 0.4861, + "mean_token_accuracy": 0.8362812995910645, + "num_tokens": 40667210.0, + "step": 1114 + }, + { + "epoch": 0.20705663881151345, + "grad_norm": 1.6295642852783203, + "learning_rate": 6.893564356435643e-07, + "loss": 0.4567, + "mean_token_accuracy": 0.8482750058174133, + "num_tokens": 40703188.0, + "step": 1115 + }, + { + "epoch": 0.20724233983286908, + "grad_norm": 1.7265169620513916, + "learning_rate": 6.899752475247524e-07, + "loss": 0.4534, + "mean_token_accuracy": 0.8488937020301819, + "num_tokens": 40733575.0, + "step": 1116 + }, + { + "epoch": 0.2074280408542247, + "grad_norm": 1.6310889720916748, + "learning_rate": 6.905940594059405e-07, + "loss": 0.4472, + "mean_token_accuracy": 0.8500089645385742, + "num_tokens": 40768832.0, + "step": 1117 + }, + { + "epoch": 0.20761374187558032, + "grad_norm": 1.6918903589248657, + "learning_rate": 6.912128712871287e-07, + "loss": 0.4755, + "mean_token_accuracy": 0.8427954912185669, + "num_tokens": 40801992.0, + "step": 1118 + }, + { + "epoch": 0.20779944289693594, + "grad_norm": 1.6956377029418945, + "learning_rate": 6.918316831683167e-07, + "loss": 0.3812, + "mean_token_accuracy": 0.8704943656921387, + "num_tokens": 40835182.0, + "step": 1119 + }, + { + "epoch": 0.20798514391829154, + "grad_norm": 1.5953071117401123, + "learning_rate": 6.924504950495049e-07, + "loss": 0.4266, + "mean_token_accuracy": 0.8551082015037537, + "num_tokens": 40869535.0, + "step": 1120 + }, + { + "epoch": 0.20817084493964716, + "grad_norm": 1.5652891397476196, + "learning_rate": 6.93069306930693e-07, + "loss": 0.4546, + "mean_token_accuracy": 0.8478295803070068, + "num_tokens": 40906082.0, + "step": 1121 + }, + { + "epoch": 0.20835654596100278, + "grad_norm": 1.610605001449585, + "learning_rate": 6.936881188118812e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8382223844528198, + "num_tokens": 40944487.0, + "step": 1122 + }, + { + "epoch": 0.2085422469823584, + "grad_norm": 1.5792624950408936, + "learning_rate": 6.943069306930693e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.8390130400657654, + "num_tokens": 40982439.0, + "step": 1123 + }, + { + "epoch": 0.20872794800371403, + "grad_norm": 1.6500155925750732, + "learning_rate": 6.949257425742574e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8369617462158203, + "num_tokens": 41021793.0, + "step": 1124 + }, + { + "epoch": 0.20891364902506965, + "grad_norm": 1.4921289682388306, + "learning_rate": 6.955445544554455e-07, + "loss": 0.4274, + "mean_token_accuracy": 0.8583403825759888, + "num_tokens": 41062566.0, + "step": 1125 + }, + { + "epoch": 0.20909935004642524, + "grad_norm": 1.660600185394287, + "learning_rate": 6.961633663366336e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.8393344283103943, + "num_tokens": 41097063.0, + "step": 1126 + }, + { + "epoch": 0.20928505106778086, + "grad_norm": 1.5435981750488281, + "learning_rate": 6.967821782178217e-07, + "loss": 0.4438, + "mean_token_accuracy": 0.8532496690750122, + "num_tokens": 41134434.0, + "step": 1127 + }, + { + "epoch": 0.20947075208913649, + "grad_norm": 1.6116522550582886, + "learning_rate": 6.974009900990099e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.8343883156776428, + "num_tokens": 41170101.0, + "step": 1128 + }, + { + "epoch": 0.2096564531104921, + "grad_norm": 1.5387777090072632, + "learning_rate": 6.980198019801979e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.844011664390564, + "num_tokens": 41211208.0, + "step": 1129 + }, + { + "epoch": 0.20984215413184773, + "grad_norm": 1.4532817602157593, + "learning_rate": 6.986386138613861e-07, + "loss": 0.4284, + "mean_token_accuracy": 0.8547865748405457, + "num_tokens": 41250675.0, + "step": 1130 + }, + { + "epoch": 0.21002785515320335, + "grad_norm": 1.6332651376724243, + "learning_rate": 6.992574257425742e-07, + "loss": 0.4664, + "mean_token_accuracy": 0.8460873365402222, + "num_tokens": 41289489.0, + "step": 1131 + }, + { + "epoch": 0.21021355617455895, + "grad_norm": 1.7239477634429932, + "learning_rate": 6.998762376237623e-07, + "loss": 0.4304, + "mean_token_accuracy": 0.8560378551483154, + "num_tokens": 41327841.0, + "step": 1132 + }, + { + "epoch": 0.21039925719591457, + "grad_norm": 1.5870417356491089, + "learning_rate": 7.004950495049504e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8436334133148193, + "num_tokens": 41369572.0, + "step": 1133 + }, + { + "epoch": 0.2105849582172702, + "grad_norm": 1.7525774240493774, + "learning_rate": 7.011138613861386e-07, + "loss": 0.4515, + "mean_token_accuracy": 0.8488665819168091, + "num_tokens": 41402703.0, + "step": 1134 + }, + { + "epoch": 0.2107706592386258, + "grad_norm": 1.6728655099868774, + "learning_rate": 7.017326732673267e-07, + "loss": 0.4261, + "mean_token_accuracy": 0.8541867733001709, + "num_tokens": 41438313.0, + "step": 1135 + }, + { + "epoch": 0.21095636025998143, + "grad_norm": 1.7843161821365356, + "learning_rate": 7.023514851485149e-07, + "loss": 0.4538, + "mean_token_accuracy": 0.8471292853355408, + "num_tokens": 41469763.0, + "step": 1136 + }, + { + "epoch": 0.21114206128133706, + "grad_norm": 1.6701573133468628, + "learning_rate": 7.029702970297029e-07, + "loss": 0.4125, + "mean_token_accuracy": 0.8633466958999634, + "num_tokens": 41503977.0, + "step": 1137 + }, + { + "epoch": 0.21132776230269265, + "grad_norm": 1.5727640390396118, + "learning_rate": 7.035891089108911e-07, + "loss": 0.442, + "mean_token_accuracy": 0.8530395030975342, + "num_tokens": 41544640.0, + "step": 1138 + }, + { + "epoch": 0.21151346332404827, + "grad_norm": 1.8263779878616333, + "learning_rate": 7.042079207920791e-07, + "loss": 0.4234, + "mean_token_accuracy": 0.8554688692092896, + "num_tokens": 41576786.0, + "step": 1139 + }, + { + "epoch": 0.2116991643454039, + "grad_norm": 1.5482217073440552, + "learning_rate": 7.048267326732673e-07, + "loss": 0.4557, + "mean_token_accuracy": 0.8504109978675842, + "num_tokens": 41614918.0, + "step": 1140 + }, + { + "epoch": 0.21188486536675952, + "grad_norm": 1.5614415407180786, + "learning_rate": 7.054455445544554e-07, + "loss": 0.4438, + "mean_token_accuracy": 0.8522484302520752, + "num_tokens": 41654375.0, + "step": 1141 + }, + { + "epoch": 0.21207056638811514, + "grad_norm": 1.635880708694458, + "learning_rate": 7.060643564356435e-07, + "loss": 0.4572, + "mean_token_accuracy": 0.846867024898529, + "num_tokens": 41689532.0, + "step": 1142 + }, + { + "epoch": 0.21225626740947076, + "grad_norm": 1.5710768699645996, + "learning_rate": 7.066831683168316e-07, + "loss": 0.4319, + "mean_token_accuracy": 0.8556655645370483, + "num_tokens": 41731583.0, + "step": 1143 + }, + { + "epoch": 0.21244196843082638, + "grad_norm": 1.6518735885620117, + "learning_rate": 7.073019801980198e-07, + "loss": 0.413, + "mean_token_accuracy": 0.8622019290924072, + "num_tokens": 41766152.0, + "step": 1144 + }, + { + "epoch": 0.21262766945218198, + "grad_norm": 1.4847543239593506, + "learning_rate": 7.079207920792078e-07, + "loss": 0.4271, + "mean_token_accuracy": 0.8577353358268738, + "num_tokens": 41803267.0, + "step": 1145 + }, + { + "epoch": 0.2128133704735376, + "grad_norm": 1.6209901571273804, + "learning_rate": 7.08539603960396e-07, + "loss": 0.4281, + "mean_token_accuracy": 0.8568170070648193, + "num_tokens": 41837371.0, + "step": 1146 + }, + { + "epoch": 0.21299907149489322, + "grad_norm": 1.5361894369125366, + "learning_rate": 7.091584158415841e-07, + "loss": 0.4027, + "mean_token_accuracy": 0.8639551997184753, + "num_tokens": 41875140.0, + "step": 1147 + }, + { + "epoch": 0.21318477251624884, + "grad_norm": 1.4842890501022339, + "learning_rate": 7.097772277227723e-07, + "loss": 0.4407, + "mean_token_accuracy": 0.8529068231582642, + "num_tokens": 41914529.0, + "step": 1148 + }, + { + "epoch": 0.21337047353760447, + "grad_norm": 1.6429466009140015, + "learning_rate": 7.103960396039604e-07, + "loss": 0.4103, + "mean_token_accuracy": 0.8629970550537109, + "num_tokens": 41948392.0, + "step": 1149 + }, + { + "epoch": 0.2135561745589601, + "grad_norm": 1.7142976522445679, + "learning_rate": 7.110148514851485e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8399884700775146, + "num_tokens": 41985041.0, + "step": 1150 + }, + { + "epoch": 0.21374187558031568, + "grad_norm": 1.7528088092803955, + "learning_rate": 7.116336633663366e-07, + "loss": 0.437, + "mean_token_accuracy": 0.8524192571640015, + "num_tokens": 42017580.0, + "step": 1151 + }, + { + "epoch": 0.2139275766016713, + "grad_norm": 1.674438238143921, + "learning_rate": 7.122524752475248e-07, + "loss": 0.4466, + "mean_token_accuracy": 0.849231481552124, + "num_tokens": 42053271.0, + "step": 1152 + }, + { + "epoch": 0.21411327762302693, + "grad_norm": 1.4015628099441528, + "learning_rate": 7.128712871287128e-07, + "loss": 0.4234, + "mean_token_accuracy": 0.8578816652297974, + "num_tokens": 42097291.0, + "step": 1153 + }, + { + "epoch": 0.21429897864438255, + "grad_norm": 1.5195322036743164, + "learning_rate": 7.13490099009901e-07, + "loss": 0.3994, + "mean_token_accuracy": 0.8681173920631409, + "num_tokens": 42133032.0, + "step": 1154 + }, + { + "epoch": 0.21448467966573817, + "grad_norm": 1.5497053861618042, + "learning_rate": 7.14108910891089e-07, + "loss": 0.417, + "mean_token_accuracy": 0.8584716320037842, + "num_tokens": 42168521.0, + "step": 1155 + }, + { + "epoch": 0.2146703806870938, + "grad_norm": 1.6110751628875732, + "learning_rate": 7.147277227722772e-07, + "loss": 0.4608, + "mean_token_accuracy": 0.8481975793838501, + "num_tokens": 42204777.0, + "step": 1156 + }, + { + "epoch": 0.2148560817084494, + "grad_norm": 1.4235838651657104, + "learning_rate": 7.153465346534653e-07, + "loss": 0.4159, + "mean_token_accuracy": 0.8616176843643188, + "num_tokens": 42248283.0, + "step": 1157 + }, + { + "epoch": 0.215041782729805, + "grad_norm": 1.5802892446517944, + "learning_rate": 7.159653465346534e-07, + "loss": 0.4413, + "mean_token_accuracy": 0.8508563041687012, + "num_tokens": 42284213.0, + "step": 1158 + }, + { + "epoch": 0.21522748375116063, + "grad_norm": 1.6149821281433105, + "learning_rate": 7.165841584158415e-07, + "loss": 0.4178, + "mean_token_accuracy": 0.8615946769714355, + "num_tokens": 42318460.0, + "step": 1159 + }, + { + "epoch": 0.21541318477251625, + "grad_norm": 1.6603065729141235, + "learning_rate": 7.172029702970297e-07, + "loss": 0.4205, + "mean_token_accuracy": 0.8554120063781738, + "num_tokens": 42352916.0, + "step": 1160 + }, + { + "epoch": 0.21559888579387188, + "grad_norm": 1.5950497388839722, + "learning_rate": 7.178217821782178e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8457690477371216, + "num_tokens": 42394001.0, + "step": 1161 + }, + { + "epoch": 0.2157845868152275, + "grad_norm": 1.5158122777938843, + "learning_rate": 7.18440594059406e-07, + "loss": 0.4406, + "mean_token_accuracy": 0.8557332754135132, + "num_tokens": 42432368.0, + "step": 1162 + }, + { + "epoch": 0.2159702878365831, + "grad_norm": 1.560829997062683, + "learning_rate": 7.19059405940594e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.850591242313385, + "num_tokens": 42468099.0, + "step": 1163 + }, + { + "epoch": 0.2161559888579387, + "grad_norm": 1.380778193473816, + "learning_rate": 7.196782178217822e-07, + "loss": 0.407, + "mean_token_accuracy": 0.8634311556816101, + "num_tokens": 42509109.0, + "step": 1164 + }, + { + "epoch": 0.21634168987929434, + "grad_norm": 1.5997074842453003, + "learning_rate": 7.202970297029703e-07, + "loss": 0.4421, + "mean_token_accuracy": 0.8529911041259766, + "num_tokens": 42542810.0, + "step": 1165 + }, + { + "epoch": 0.21652739090064996, + "grad_norm": 1.7386095523834229, + "learning_rate": 7.209158415841584e-07, + "loss": 0.4276, + "mean_token_accuracy": 0.8562740683555603, + "num_tokens": 42573424.0, + "step": 1166 + }, + { + "epoch": 0.21671309192200558, + "grad_norm": 1.8001359701156616, + "learning_rate": 7.215346534653465e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.8360533118247986, + "num_tokens": 42604057.0, + "step": 1167 + }, + { + "epoch": 0.2168987929433612, + "grad_norm": 1.6744775772094727, + "learning_rate": 7.221534653465346e-07, + "loss": 0.4513, + "mean_token_accuracy": 0.8519056439399719, + "num_tokens": 42638702.0, + "step": 1168 + }, + { + "epoch": 0.2170844939647168, + "grad_norm": 1.7821295261383057, + "learning_rate": 7.227722772277227e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8394291400909424, + "num_tokens": 42668492.0, + "step": 1169 + }, + { + "epoch": 0.21727019498607242, + "grad_norm": 1.658185601234436, + "learning_rate": 7.233910891089109e-07, + "loss": 0.4276, + "mean_token_accuracy": 0.8551485538482666, + "num_tokens": 42703247.0, + "step": 1170 + }, + { + "epoch": 0.21745589600742804, + "grad_norm": 1.4856491088867188, + "learning_rate": 7.240099009900989e-07, + "loss": 0.4184, + "mean_token_accuracy": 0.85894376039505, + "num_tokens": 42741095.0, + "step": 1171 + }, + { + "epoch": 0.21764159702878366, + "grad_norm": 1.6426870822906494, + "learning_rate": 7.246287128712871e-07, + "loss": 0.4057, + "mean_token_accuracy": 0.8629447817802429, + "num_tokens": 42776823.0, + "step": 1172 + }, + { + "epoch": 0.21782729805013928, + "grad_norm": 1.6295645236968994, + "learning_rate": 7.252475247524751e-07, + "loss": 0.4122, + "mean_token_accuracy": 0.8632147312164307, + "num_tokens": 42810322.0, + "step": 1173 + }, + { + "epoch": 0.2180129990714949, + "grad_norm": 1.705444097518921, + "learning_rate": 7.258663366336634e-07, + "loss": 0.472, + "mean_token_accuracy": 0.8460301160812378, + "num_tokens": 42844617.0, + "step": 1174 + }, + { + "epoch": 0.2181987000928505, + "grad_norm": 1.6638160943984985, + "learning_rate": 7.264851485148515e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.844089925289154, + "num_tokens": 42881225.0, + "step": 1175 + }, + { + "epoch": 0.21838440111420612, + "grad_norm": 1.6543771028518677, + "learning_rate": 7.271039603960396e-07, + "loss": 0.4525, + "mean_token_accuracy": 0.8478835821151733, + "num_tokens": 42918232.0, + "step": 1176 + }, + { + "epoch": 0.21857010213556174, + "grad_norm": 1.729959487915039, + "learning_rate": 7.277227722772277e-07, + "loss": 0.3989, + "mean_token_accuracy": 0.8669884204864502, + "num_tokens": 42949916.0, + "step": 1177 + }, + { + "epoch": 0.21875580315691737, + "grad_norm": 1.6717592477798462, + "learning_rate": 7.283415841584159e-07, + "loss": 0.4391, + "mean_token_accuracy": 0.8567262887954712, + "num_tokens": 42981549.0, + "step": 1178 + }, + { + "epoch": 0.218941504178273, + "grad_norm": 1.6725260019302368, + "learning_rate": 7.289603960396039e-07, + "loss": 0.4539, + "mean_token_accuracy": 0.8506083488464355, + "num_tokens": 43017335.0, + "step": 1179 + }, + { + "epoch": 0.2191272051996286, + "grad_norm": 1.6832149028778076, + "learning_rate": 7.295792079207921e-07, + "loss": 0.3986, + "mean_token_accuracy": 0.8605070114135742, + "num_tokens": 43048225.0, + "step": 1180 + }, + { + "epoch": 0.2193129062209842, + "grad_norm": 1.5778998136520386, + "learning_rate": 7.301980198019801e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.8473857641220093, + "num_tokens": 43085078.0, + "step": 1181 + }, + { + "epoch": 0.21949860724233983, + "grad_norm": 1.481837272644043, + "learning_rate": 7.308168316831683e-07, + "loss": 0.4312, + "mean_token_accuracy": 0.8600885272026062, + "num_tokens": 43126729.0, + "step": 1182 + }, + { + "epoch": 0.21968430826369545, + "grad_norm": 1.5865708589553833, + "learning_rate": 7.314356435643564e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.8390740156173706, + "num_tokens": 43166203.0, + "step": 1183 + }, + { + "epoch": 0.21987000928505107, + "grad_norm": 1.594377040863037, + "learning_rate": 7.320544554455445e-07, + "loss": 0.425, + "mean_token_accuracy": 0.8599157333374023, + "num_tokens": 43202377.0, + "step": 1184 + }, + { + "epoch": 0.2200557103064067, + "grad_norm": 1.5021333694458008, + "learning_rate": 7.326732673267326e-07, + "loss": 0.3968, + "mean_token_accuracy": 0.8678241968154907, + "num_tokens": 43243424.0, + "step": 1185 + }, + { + "epoch": 0.22024141132776232, + "grad_norm": 1.5830714702606201, + "learning_rate": 7.332920792079207e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8566552400588989, + "num_tokens": 43278749.0, + "step": 1186 + }, + { + "epoch": 0.2204271123491179, + "grad_norm": 1.6310116052627563, + "learning_rate": 7.339108910891089e-07, + "loss": 0.47, + "mean_token_accuracy": 0.8437738418579102, + "num_tokens": 43316007.0, + "step": 1187 + }, + { + "epoch": 0.22061281337047353, + "grad_norm": 1.5851143598556519, + "learning_rate": 7.345297029702971e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.850861132144928, + "num_tokens": 43353479.0, + "step": 1188 + }, + { + "epoch": 0.22079851439182915, + "grad_norm": 1.4820704460144043, + "learning_rate": 7.351485148514851e-07, + "loss": 0.3759, + "mean_token_accuracy": 0.8728307485580444, + "num_tokens": 43389676.0, + "step": 1189 + }, + { + "epoch": 0.22098421541318478, + "grad_norm": 1.5202641487121582, + "learning_rate": 7.357673267326733e-07, + "loss": 0.4104, + "mean_token_accuracy": 0.8635997176170349, + "num_tokens": 43429829.0, + "step": 1190 + }, + { + "epoch": 0.2211699164345404, + "grad_norm": 1.566296935081482, + "learning_rate": 7.363861386138614e-07, + "loss": 0.3912, + "mean_token_accuracy": 0.8689247369766235, + "num_tokens": 43463211.0, + "step": 1191 + }, + { + "epoch": 0.22135561745589602, + "grad_norm": 1.716131567955017, + "learning_rate": 7.370049504950495e-07, + "loss": 0.4969, + "mean_token_accuracy": 0.8350808620452881, + "num_tokens": 43500827.0, + "step": 1192 + }, + { + "epoch": 0.22154131847725161, + "grad_norm": 1.5725592374801636, + "learning_rate": 7.376237623762376e-07, + "loss": 0.4053, + "mean_token_accuracy": 0.8575105667114258, + "num_tokens": 43533406.0, + "step": 1193 + }, + { + "epoch": 0.22172701949860724, + "grad_norm": 1.5521295070648193, + "learning_rate": 7.382425742574257e-07, + "loss": 0.4283, + "mean_token_accuracy": 0.8581640720367432, + "num_tokens": 43570236.0, + "step": 1194 + }, + { + "epoch": 0.22191272051996286, + "grad_norm": 1.634159803390503, + "learning_rate": 7.388613861386138e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8437697291374207, + "num_tokens": 43605759.0, + "step": 1195 + }, + { + "epoch": 0.22209842154131848, + "grad_norm": 1.528854489326477, + "learning_rate": 7.39480198019802e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.8538081645965576, + "num_tokens": 43643151.0, + "step": 1196 + }, + { + "epoch": 0.2222841225626741, + "grad_norm": 1.545552372932434, + "learning_rate": 7.4009900990099e-07, + "loss": 0.3858, + "mean_token_accuracy": 0.8701061606407166, + "num_tokens": 43682225.0, + "step": 1197 + }, + { + "epoch": 0.22246982358402972, + "grad_norm": 1.6278510093688965, + "learning_rate": 7.407178217821782e-07, + "loss": 0.4385, + "mean_token_accuracy": 0.8530551195144653, + "num_tokens": 43720809.0, + "step": 1198 + }, + { + "epoch": 0.22265552460538532, + "grad_norm": 1.6365426778793335, + "learning_rate": 7.413366336633662e-07, + "loss": 0.3944, + "mean_token_accuracy": 0.8664301037788391, + "num_tokens": 43756232.0, + "step": 1199 + }, + { + "epoch": 0.22284122562674094, + "grad_norm": 1.7460333108901978, + "learning_rate": 7.419554455445545e-07, + "loss": 0.4815, + "mean_token_accuracy": 0.8379535675048828, + "num_tokens": 43789586.0, + "step": 1200 + }, + { + "epoch": 0.22302692664809656, + "grad_norm": 1.5553396940231323, + "learning_rate": 7.425742574257426e-07, + "loss": 0.4055, + "mean_token_accuracy": 0.864149808883667, + "num_tokens": 43826438.0, + "step": 1201 + }, + { + "epoch": 0.22321262766945218, + "grad_norm": 1.5760250091552734, + "learning_rate": 7.431930693069307e-07, + "loss": 0.4458, + "mean_token_accuracy": 0.8522080183029175, + "num_tokens": 43863453.0, + "step": 1202 + }, + { + "epoch": 0.2233983286908078, + "grad_norm": 1.6687008142471313, + "learning_rate": 7.438118811881188e-07, + "loss": 0.4492, + "mean_token_accuracy": 0.8521745204925537, + "num_tokens": 43898978.0, + "step": 1203 + }, + { + "epoch": 0.22358402971216343, + "grad_norm": 1.4982656240463257, + "learning_rate": 7.44430693069307e-07, + "loss": 0.4472, + "mean_token_accuracy": 0.8497774600982666, + "num_tokens": 43938665.0, + "step": 1204 + }, + { + "epoch": 0.22376973073351902, + "grad_norm": 1.570350170135498, + "learning_rate": 7.45049504950495e-07, + "loss": 0.4305, + "mean_token_accuracy": 0.8550247550010681, + "num_tokens": 43980686.0, + "step": 1205 + }, + { + "epoch": 0.22395543175487465, + "grad_norm": 1.5142492055892944, + "learning_rate": 7.456683168316832e-07, + "loss": 0.4602, + "mean_token_accuracy": 0.8463491201400757, + "num_tokens": 44025249.0, + "step": 1206 + }, + { + "epoch": 0.22414113277623027, + "grad_norm": 1.5951130390167236, + "learning_rate": 7.462871287128712e-07, + "loss": 0.4145, + "mean_token_accuracy": 0.8600290417671204, + "num_tokens": 44059171.0, + "step": 1207 + }, + { + "epoch": 0.2243268337975859, + "grad_norm": 1.631240725517273, + "learning_rate": 7.469059405940594e-07, + "loss": 0.42, + "mean_token_accuracy": 0.8622831106185913, + "num_tokens": 44089301.0, + "step": 1208 + }, + { + "epoch": 0.2245125348189415, + "grad_norm": 1.5077160596847534, + "learning_rate": 7.475247524752475e-07, + "loss": 0.3801, + "mean_token_accuracy": 0.8738104104995728, + "num_tokens": 44126321.0, + "step": 1209 + }, + { + "epoch": 0.22469823584029713, + "grad_norm": 1.6536442041397095, + "learning_rate": 7.481435643564356e-07, + "loss": 0.405, + "mean_token_accuracy": 0.8611079454421997, + "num_tokens": 44157081.0, + "step": 1210 + }, + { + "epoch": 0.22488393686165273, + "grad_norm": 1.4851219654083252, + "learning_rate": 7.487623762376237e-07, + "loss": 0.4515, + "mean_token_accuracy": 0.8466308116912842, + "num_tokens": 44197289.0, + "step": 1211 + }, + { + "epoch": 0.22506963788300835, + "grad_norm": 1.5575779676437378, + "learning_rate": 7.493811881188119e-07, + "loss": 0.4031, + "mean_token_accuracy": 0.8602436780929565, + "num_tokens": 44236153.0, + "step": 1212 + }, + { + "epoch": 0.22525533890436397, + "grad_norm": 1.8558496236801147, + "learning_rate": 7.5e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.8448315858840942, + "num_tokens": 44264493.0, + "step": 1213 + }, + { + "epoch": 0.2254410399257196, + "grad_norm": 1.7039051055908203, + "learning_rate": 7.506188118811881e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8447630405426025, + "num_tokens": 44300765.0, + "step": 1214 + }, + { + "epoch": 0.22562674094707522, + "grad_norm": 1.559857726097107, + "learning_rate": 7.512376237623762e-07, + "loss": 0.4477, + "mean_token_accuracy": 0.852939784526825, + "num_tokens": 44338511.0, + "step": 1215 + }, + { + "epoch": 0.22581244196843084, + "grad_norm": 1.6263513565063477, + "learning_rate": 7.518564356435643e-07, + "loss": 0.4374, + "mean_token_accuracy": 0.8534374833106995, + "num_tokens": 44372987.0, + "step": 1216 + }, + { + "epoch": 0.22599814298978643, + "grad_norm": 1.7617629766464233, + "learning_rate": 7.524752475247525e-07, + "loss": 0.4334, + "mean_token_accuracy": 0.8542121648788452, + "num_tokens": 44406278.0, + "step": 1217 + }, + { + "epoch": 0.22618384401114205, + "grad_norm": 1.6589804887771606, + "learning_rate": 7.530940594059405e-07, + "loss": 0.4327, + "mean_token_accuracy": 0.8560677766799927, + "num_tokens": 44441789.0, + "step": 1218 + }, + { + "epoch": 0.22636954503249768, + "grad_norm": 1.6855701208114624, + "learning_rate": 7.537128712871287e-07, + "loss": 0.4528, + "mean_token_accuracy": 0.8491514921188354, + "num_tokens": 44476044.0, + "step": 1219 + }, + { + "epoch": 0.2265552460538533, + "grad_norm": 1.6838208436965942, + "learning_rate": 7.543316831683167e-07, + "loss": 0.4605, + "mean_token_accuracy": 0.8459525108337402, + "num_tokens": 44510348.0, + "step": 1220 + }, + { + "epoch": 0.22674094707520892, + "grad_norm": 1.577154278755188, + "learning_rate": 7.549504950495049e-07, + "loss": 0.4121, + "mean_token_accuracy": 0.8612176775932312, + "num_tokens": 44544056.0, + "step": 1221 + }, + { + "epoch": 0.22692664809656454, + "grad_norm": 1.5381969213485718, + "learning_rate": 7.55569306930693e-07, + "loss": 0.4223, + "mean_token_accuracy": 0.8565598130226135, + "num_tokens": 44580664.0, + "step": 1222 + }, + { + "epoch": 0.22711234911792014, + "grad_norm": 1.6464896202087402, + "learning_rate": 7.561881188118811e-07, + "loss": 0.3937, + "mean_token_accuracy": 0.8682312965393066, + "num_tokens": 44615860.0, + "step": 1223 + }, + { + "epoch": 0.22729805013927576, + "grad_norm": 1.512406349182129, + "learning_rate": 7.568069306930692e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.8474634885787964, + "num_tokens": 44657312.0, + "step": 1224 + }, + { + "epoch": 0.22748375116063138, + "grad_norm": 1.4758327007293701, + "learning_rate": 7.574257425742574e-07, + "loss": 0.445, + "mean_token_accuracy": 0.8520218133926392, + "num_tokens": 44698056.0, + "step": 1225 + }, + { + "epoch": 0.227669452181987, + "grad_norm": 1.64720618724823, + "learning_rate": 7.580445544554454e-07, + "loss": 0.4211, + "mean_token_accuracy": 0.8581889271736145, + "num_tokens": 44731754.0, + "step": 1226 + }, + { + "epoch": 0.22785515320334263, + "grad_norm": 1.6275520324707031, + "learning_rate": 7.586633663366337e-07, + "loss": 0.4575, + "mean_token_accuracy": 0.8462752103805542, + "num_tokens": 44768951.0, + "step": 1227 + }, + { + "epoch": 0.22804085422469825, + "grad_norm": 1.528877854347229, + "learning_rate": 7.592821782178217e-07, + "loss": 0.4265, + "mean_token_accuracy": 0.8587865233421326, + "num_tokens": 44807127.0, + "step": 1228 + }, + { + "epoch": 0.22822655524605384, + "grad_norm": 1.4401637315750122, + "learning_rate": 7.599009900990099e-07, + "loss": 0.4018, + "mean_token_accuracy": 0.8645272850990295, + "num_tokens": 44846237.0, + "step": 1229 + }, + { + "epoch": 0.22841225626740946, + "grad_norm": 1.622001051902771, + "learning_rate": 7.60519801980198e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.8391820192337036, + "num_tokens": 44882861.0, + "step": 1230 + }, + { + "epoch": 0.22859795728876509, + "grad_norm": 1.5450513362884521, + "learning_rate": 7.611386138613861e-07, + "loss": 0.3936, + "mean_token_accuracy": 0.8661783933639526, + "num_tokens": 44919345.0, + "step": 1231 + }, + { + "epoch": 0.2287836583101207, + "grad_norm": 1.6035832166671753, + "learning_rate": 7.617574257425742e-07, + "loss": 0.4207, + "mean_token_accuracy": 0.8569719791412354, + "num_tokens": 44955131.0, + "step": 1232 + }, + { + "epoch": 0.22896935933147633, + "grad_norm": 1.5803896188735962, + "learning_rate": 7.623762376237624e-07, + "loss": 0.478, + "mean_token_accuracy": 0.8417491912841797, + "num_tokens": 44991782.0, + "step": 1233 + }, + { + "epoch": 0.22915506035283195, + "grad_norm": 1.6325440406799316, + "learning_rate": 7.629950495049504e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.8339570164680481, + "num_tokens": 45030807.0, + "step": 1234 + }, + { + "epoch": 0.22934076137418755, + "grad_norm": 1.6884757280349731, + "learning_rate": 7.636138613861386e-07, + "loss": 0.4186, + "mean_token_accuracy": 0.8592535257339478, + "num_tokens": 45062121.0, + "step": 1235 + }, + { + "epoch": 0.22952646239554317, + "grad_norm": 1.4876008033752441, + "learning_rate": 7.642326732673266e-07, + "loss": 0.4428, + "mean_token_accuracy": 0.8479593992233276, + "num_tokens": 45103522.0, + "step": 1236 + }, + { + "epoch": 0.2297121634168988, + "grad_norm": 1.6150463819503784, + "learning_rate": 7.648514851485148e-07, + "loss": 0.4274, + "mean_token_accuracy": 0.856469988822937, + "num_tokens": 45136751.0, + "step": 1237 + }, + { + "epoch": 0.2298978644382544, + "grad_norm": 1.7115954160690308, + "learning_rate": 7.654702970297029e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.8331154584884644, + "num_tokens": 45172155.0, + "step": 1238 + }, + { + "epoch": 0.23008356545961003, + "grad_norm": 1.667059302330017, + "learning_rate": 7.66089108910891e-07, + "loss": 0.4378, + "mean_token_accuracy": 0.8545732498168945, + "num_tokens": 45204541.0, + "step": 1239 + }, + { + "epoch": 0.23026926648096566, + "grad_norm": 1.5373173952102661, + "learning_rate": 7.667079207920792e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.8499181270599365, + "num_tokens": 45240474.0, + "step": 1240 + }, + { + "epoch": 0.23045496750232125, + "grad_norm": 1.6153544187545776, + "learning_rate": 7.673267326732673e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8438754677772522, + "num_tokens": 45276518.0, + "step": 1241 + }, + { + "epoch": 0.23064066852367687, + "grad_norm": 1.6260732412338257, + "learning_rate": 7.679455445544554e-07, + "loss": 0.4084, + "mean_token_accuracy": 0.8641057014465332, + "num_tokens": 45309641.0, + "step": 1242 + }, + { + "epoch": 0.2308263695450325, + "grad_norm": 1.5485639572143555, + "learning_rate": 7.685643564356436e-07, + "loss": 0.452, + "mean_token_accuracy": 0.8469557762145996, + "num_tokens": 45349137.0, + "step": 1243 + }, + { + "epoch": 0.23101207056638812, + "grad_norm": 1.5422710180282593, + "learning_rate": 7.691831683168316e-07, + "loss": 0.4249, + "mean_token_accuracy": 0.8571421504020691, + "num_tokens": 45386277.0, + "step": 1244 + }, + { + "epoch": 0.23119777158774374, + "grad_norm": 1.5731918811798096, + "learning_rate": 7.698019801980198e-07, + "loss": 0.4336, + "mean_token_accuracy": 0.8580323457717896, + "num_tokens": 45421994.0, + "step": 1245 + }, + { + "epoch": 0.23138347260909936, + "grad_norm": 1.4393980503082275, + "learning_rate": 7.704207920792079e-07, + "loss": 0.4457, + "mean_token_accuracy": 0.8557884097099304, + "num_tokens": 45465839.0, + "step": 1246 + }, + { + "epoch": 0.23156917363045496, + "grad_norm": 1.6747175455093384, + "learning_rate": 7.71039603960396e-07, + "loss": 0.4765, + "mean_token_accuracy": 0.8395172357559204, + "num_tokens": 45499722.0, + "step": 1247 + }, + { + "epoch": 0.23175487465181058, + "grad_norm": 1.457464337348938, + "learning_rate": 7.716584158415841e-07, + "loss": 0.4297, + "mean_token_accuracy": 0.8585302829742432, + "num_tokens": 45540104.0, + "step": 1248 + }, + { + "epoch": 0.2319405756731662, + "grad_norm": 1.8430240154266357, + "learning_rate": 7.722772277227722e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.8392161130905151, + "num_tokens": 45575287.0, + "step": 1249 + }, + { + "epoch": 0.23212627669452182, + "grad_norm": 1.5509309768676758, + "learning_rate": 7.728960396039603e-07, + "loss": 0.4717, + "mean_token_accuracy": 0.8446331024169922, + "num_tokens": 45614057.0, + "step": 1250 + }, + { + "epoch": 0.23231197771587744, + "grad_norm": 1.6953952312469482, + "learning_rate": 7.735148514851485e-07, + "loss": 0.4423, + "mean_token_accuracy": 0.8532345294952393, + "num_tokens": 45646174.0, + "step": 1251 + }, + { + "epoch": 0.23249767873723307, + "grad_norm": 1.6393524408340454, + "learning_rate": 7.741336633663365e-07, + "loss": 0.4102, + "mean_token_accuracy": 0.8610299825668335, + "num_tokens": 45680153.0, + "step": 1252 + }, + { + "epoch": 0.23268337975858866, + "grad_norm": 1.7500637769699097, + "learning_rate": 7.747524752475248e-07, + "loss": 0.4552, + "mean_token_accuracy": 0.8474099636077881, + "num_tokens": 45717294.0, + "step": 1253 + }, + { + "epoch": 0.23286908077994428, + "grad_norm": 1.6782424449920654, + "learning_rate": 7.753712871287128e-07, + "loss": 0.4207, + "mean_token_accuracy": 0.8595997095108032, + "num_tokens": 45750712.0, + "step": 1254 + }, + { + "epoch": 0.2330547818012999, + "grad_norm": 1.595218300819397, + "learning_rate": 7.75990099009901e-07, + "loss": 0.4159, + "mean_token_accuracy": 0.8632656335830688, + "num_tokens": 45789024.0, + "step": 1255 + }, + { + "epoch": 0.23324048282265553, + "grad_norm": 1.4902317523956299, + "learning_rate": 7.766089108910891e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.8471255898475647, + "num_tokens": 45828521.0, + "step": 1256 + }, + { + "epoch": 0.23342618384401115, + "grad_norm": 1.707590937614441, + "learning_rate": 7.772277227722772e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.8408281803131104, + "num_tokens": 45861413.0, + "step": 1257 + }, + { + "epoch": 0.23361188486536677, + "grad_norm": 1.6510334014892578, + "learning_rate": 7.778465346534653e-07, + "loss": 0.4095, + "mean_token_accuracy": 0.8657094836235046, + "num_tokens": 45894377.0, + "step": 1258 + }, + { + "epoch": 0.23379758588672236, + "grad_norm": 1.7125048637390137, + "learning_rate": 7.784653465346535e-07, + "loss": 0.4156, + "mean_token_accuracy": 0.863103985786438, + "num_tokens": 45929244.0, + "step": 1259 + }, + { + "epoch": 0.233983286908078, + "grad_norm": 1.7034157514572144, + "learning_rate": 7.790841584158415e-07, + "loss": 0.4325, + "mean_token_accuracy": 0.8502476215362549, + "num_tokens": 45963138.0, + "step": 1260 + }, + { + "epoch": 0.2341689879294336, + "grad_norm": 1.6118923425674438, + "learning_rate": 7.797029702970297e-07, + "loss": 0.4395, + "mean_token_accuracy": 0.8598674535751343, + "num_tokens": 46000639.0, + "step": 1261 + }, + { + "epoch": 0.23435468895078923, + "grad_norm": 1.7114293575286865, + "learning_rate": 7.803217821782177e-07, + "loss": 0.4175, + "mean_token_accuracy": 0.8618795871734619, + "num_tokens": 46030780.0, + "step": 1262 + }, + { + "epoch": 0.23454038997214485, + "grad_norm": 1.651335597038269, + "learning_rate": 7.809405940594059e-07, + "loss": 0.4063, + "mean_token_accuracy": 0.859428346157074, + "num_tokens": 46064312.0, + "step": 1263 + }, + { + "epoch": 0.23472609099350047, + "grad_norm": 1.5997430086135864, + "learning_rate": 7.81559405940594e-07, + "loss": 0.4679, + "mean_token_accuracy": 0.8471482992172241, + "num_tokens": 46101056.0, + "step": 1264 + }, + { + "epoch": 0.23491179201485607, + "grad_norm": 1.4935975074768066, + "learning_rate": 7.821782178217821e-07, + "loss": 0.401, + "mean_token_accuracy": 0.8700084090232849, + "num_tokens": 46137228.0, + "step": 1265 + }, + { + "epoch": 0.2350974930362117, + "grad_norm": 1.4035495519638062, + "learning_rate": 7.827970297029702e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8570044040679932, + "num_tokens": 46181707.0, + "step": 1266 + }, + { + "epoch": 0.2352831940575673, + "grad_norm": 1.5885647535324097, + "learning_rate": 7.834158415841585e-07, + "loss": 0.4593, + "mean_token_accuracy": 0.8540343046188354, + "num_tokens": 46220222.0, + "step": 1267 + }, + { + "epoch": 0.23546889507892294, + "grad_norm": 1.566226601600647, + "learning_rate": 7.840346534653465e-07, + "loss": 0.4116, + "mean_token_accuracy": 0.8617403507232666, + "num_tokens": 46259180.0, + "step": 1268 + }, + { + "epoch": 0.23565459610027856, + "grad_norm": 1.5368942022323608, + "learning_rate": 7.846534653465347e-07, + "loss": 0.4445, + "mean_token_accuracy": 0.8496439456939697, + "num_tokens": 46298276.0, + "step": 1269 + }, + { + "epoch": 0.23584029712163418, + "grad_norm": 1.484374761581421, + "learning_rate": 7.852722772277227e-07, + "loss": 0.455, + "mean_token_accuracy": 0.8496726155281067, + "num_tokens": 46341119.0, + "step": 1270 + }, + { + "epoch": 0.23602599814298977, + "grad_norm": 1.7399314641952515, + "learning_rate": 7.858910891089109e-07, + "loss": 0.4296, + "mean_token_accuracy": 0.8524767160415649, + "num_tokens": 46371365.0, + "step": 1271 + }, + { + "epoch": 0.2362116991643454, + "grad_norm": 1.602038860321045, + "learning_rate": 7.86509900990099e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8334801197052002, + "num_tokens": 46410699.0, + "step": 1272 + }, + { + "epoch": 0.23639740018570102, + "grad_norm": 1.5373828411102295, + "learning_rate": 7.871287128712871e-07, + "loss": 0.4013, + "mean_token_accuracy": 0.8644568920135498, + "num_tokens": 46449068.0, + "step": 1273 + }, + { + "epoch": 0.23658310120705664, + "grad_norm": 1.6027145385742188, + "learning_rate": 7.877475247524752e-07, + "loss": 0.452, + "mean_token_accuracy": 0.8494541645050049, + "num_tokens": 46483515.0, + "step": 1274 + }, + { + "epoch": 0.23676880222841226, + "grad_norm": 1.6209856271743774, + "learning_rate": 7.883663366336633e-07, + "loss": 0.468, + "mean_token_accuracy": 0.8482459783554077, + "num_tokens": 46518307.0, + "step": 1275 + }, + { + "epoch": 0.23695450324976788, + "grad_norm": 1.5084054470062256, + "learning_rate": 7.889851485148514e-07, + "loss": 0.4373, + "mean_token_accuracy": 0.8577027320861816, + "num_tokens": 46561154.0, + "step": 1276 + }, + { + "epoch": 0.23714020427112348, + "grad_norm": 1.5252865552902222, + "learning_rate": 7.896039603960396e-07, + "loss": 0.4332, + "mean_token_accuracy": 0.8567045331001282, + "num_tokens": 46597527.0, + "step": 1277 + }, + { + "epoch": 0.2373259052924791, + "grad_norm": 1.7042572498321533, + "learning_rate": 7.902227722772276e-07, + "loss": 0.4536, + "mean_token_accuracy": 0.8515785932540894, + "num_tokens": 46631487.0, + "step": 1278 + }, + { + "epoch": 0.23751160631383472, + "grad_norm": 1.8029979467391968, + "learning_rate": 7.908415841584158e-07, + "loss": 0.3746, + "mean_token_accuracy": 0.8708419799804688, + "num_tokens": 46659939.0, + "step": 1279 + }, + { + "epoch": 0.23769730733519034, + "grad_norm": 1.460963249206543, + "learning_rate": 7.91460396039604e-07, + "loss": 0.4098, + "mean_token_accuracy": 0.8645992279052734, + "num_tokens": 46699530.0, + "step": 1280 + }, + { + "epoch": 0.23788300835654597, + "grad_norm": 1.5628623962402344, + "learning_rate": 7.920792079207921e-07, + "loss": 0.4199, + "mean_token_accuracy": 0.8573300838470459, + "num_tokens": 46732989.0, + "step": 1281 + }, + { + "epoch": 0.2380687093779016, + "grad_norm": 1.618773102760315, + "learning_rate": 7.926980198019802e-07, + "loss": 0.4298, + "mean_token_accuracy": 0.8526486158370972, + "num_tokens": 46767519.0, + "step": 1282 + }, + { + "epoch": 0.23825441039925718, + "grad_norm": 1.6517784595489502, + "learning_rate": 7.933168316831683e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.8379698991775513, + "num_tokens": 46803333.0, + "step": 1283 + }, + { + "epoch": 0.2384401114206128, + "grad_norm": 1.6541084051132202, + "learning_rate": 7.939356435643564e-07, + "loss": 0.424, + "mean_token_accuracy": 0.8512810468673706, + "num_tokens": 46837442.0, + "step": 1284 + }, + { + "epoch": 0.23862581244196843, + "grad_norm": 1.6336524486541748, + "learning_rate": 7.945544554455446e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8434302806854248, + "num_tokens": 46873492.0, + "step": 1285 + }, + { + "epoch": 0.23881151346332405, + "grad_norm": 1.8293911218643188, + "learning_rate": 7.951732673267326e-07, + "loss": 0.4734, + "mean_token_accuracy": 0.8438313007354736, + "num_tokens": 46907411.0, + "step": 1286 + }, + { + "epoch": 0.23899721448467967, + "grad_norm": 1.683242917060852, + "learning_rate": 7.957920792079208e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.8488483428955078, + "num_tokens": 46939349.0, + "step": 1287 + }, + { + "epoch": 0.2391829155060353, + "grad_norm": 1.6928426027297974, + "learning_rate": 7.964108910891088e-07, + "loss": 0.4804, + "mean_token_accuracy": 0.8415033221244812, + "num_tokens": 46973899.0, + "step": 1288 + }, + { + "epoch": 0.2393686165273909, + "grad_norm": 1.583524465560913, + "learning_rate": 7.97029702970297e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.8469568490982056, + "num_tokens": 47010839.0, + "step": 1289 + }, + { + "epoch": 0.2395543175487465, + "grad_norm": 1.6295958757400513, + "learning_rate": 7.976485148514851e-07, + "loss": 0.4233, + "mean_token_accuracy": 0.8568522930145264, + "num_tokens": 47043303.0, + "step": 1290 + }, + { + "epoch": 0.23974001857010213, + "grad_norm": 1.552667260169983, + "learning_rate": 7.982673267326732e-07, + "loss": 0.4404, + "mean_token_accuracy": 0.850842297077179, + "num_tokens": 47080987.0, + "step": 1291 + }, + { + "epoch": 0.23992571959145775, + "grad_norm": 1.5996124744415283, + "learning_rate": 7.988861386138613e-07, + "loss": 0.4547, + "mean_token_accuracy": 0.8452770709991455, + "num_tokens": 47116929.0, + "step": 1292 + }, + { + "epoch": 0.24011142061281338, + "grad_norm": 1.5442291498184204, + "learning_rate": 7.995049504950496e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8354535698890686, + "num_tokens": 47158416.0, + "step": 1293 + }, + { + "epoch": 0.240297121634169, + "grad_norm": 1.6959112882614136, + "learning_rate": 8.001237623762376e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8508318662643433, + "num_tokens": 47194119.0, + "step": 1294 + }, + { + "epoch": 0.2404828226555246, + "grad_norm": 1.4476572275161743, + "learning_rate": 8.007425742574258e-07, + "loss": 0.4062, + "mean_token_accuracy": 0.8633317947387695, + "num_tokens": 47234580.0, + "step": 1295 + }, + { + "epoch": 0.2406685236768802, + "grad_norm": 1.5067355632781982, + "learning_rate": 8.013613861386138e-07, + "loss": 0.4338, + "mean_token_accuracy": 0.855538547039032, + "num_tokens": 47275721.0, + "step": 1296 + }, + { + "epoch": 0.24085422469823584, + "grad_norm": 1.4818871021270752, + "learning_rate": 8.01980198019802e-07, + "loss": 0.4082, + "mean_token_accuracy": 0.8635349869728088, + "num_tokens": 47316227.0, + "step": 1297 + }, + { + "epoch": 0.24103992571959146, + "grad_norm": 1.6030892133712769, + "learning_rate": 8.025990099009901e-07, + "loss": 0.4425, + "mean_token_accuracy": 0.8499743342399597, + "num_tokens": 47351742.0, + "step": 1298 + }, + { + "epoch": 0.24122562674094708, + "grad_norm": 1.5614105463027954, + "learning_rate": 8.032178217821782e-07, + "loss": 0.453, + "mean_token_accuracy": 0.850068986415863, + "num_tokens": 47389715.0, + "step": 1299 + }, + { + "epoch": 0.2414113277623027, + "grad_norm": 1.6988779306411743, + "learning_rate": 8.038366336633663e-07, + "loss": 0.4403, + "mean_token_accuracy": 0.8471062183380127, + "num_tokens": 47422045.0, + "step": 1300 + }, + { + "epoch": 0.2415970287836583, + "grad_norm": 1.5705723762512207, + "learning_rate": 8.044554455445544e-07, + "loss": 0.4921, + "mean_token_accuracy": 0.8360376358032227, + "num_tokens": 47461771.0, + "step": 1301 + }, + { + "epoch": 0.24178272980501392, + "grad_norm": 1.5858358144760132, + "learning_rate": 8.050742574257425e-07, + "loss": 0.4416, + "mean_token_accuracy": 0.8503433465957642, + "num_tokens": 47500545.0, + "step": 1302 + }, + { + "epoch": 0.24196843082636954, + "grad_norm": 1.5083377361297607, + "learning_rate": 8.056930693069307e-07, + "loss": 0.4154, + "mean_token_accuracy": 0.8621847629547119, + "num_tokens": 47539451.0, + "step": 1303 + }, + { + "epoch": 0.24215413184772516, + "grad_norm": 1.5914530754089355, + "learning_rate": 8.063118811881187e-07, + "loss": 0.45, + "mean_token_accuracy": 0.8517868518829346, + "num_tokens": 47578207.0, + "step": 1304 + }, + { + "epoch": 0.24233983286908078, + "grad_norm": 1.57510507106781, + "learning_rate": 8.069306930693069e-07, + "loss": 0.5199, + "mean_token_accuracy": 0.8316490650177002, + "num_tokens": 47620216.0, + "step": 1305 + }, + { + "epoch": 0.2425255338904364, + "grad_norm": 1.6529909372329712, + "learning_rate": 8.075495049504951e-07, + "loss": 0.4195, + "mean_token_accuracy": 0.8596919178962708, + "num_tokens": 47652750.0, + "step": 1306 + }, + { + "epoch": 0.24271123491179203, + "grad_norm": 1.4805028438568115, + "learning_rate": 8.081683168316832e-07, + "loss": 0.4147, + "mean_token_accuracy": 0.861433744430542, + "num_tokens": 47695409.0, + "step": 1307 + }, + { + "epoch": 0.24289693593314762, + "grad_norm": 1.5485358238220215, + "learning_rate": 8.087871287128713e-07, + "loss": 0.3815, + "mean_token_accuracy": 0.8719822764396667, + "num_tokens": 47730184.0, + "step": 1308 + }, + { + "epoch": 0.24308263695450325, + "grad_norm": 1.5270098447799683, + "learning_rate": 8.094059405940594e-07, + "loss": 0.4318, + "mean_token_accuracy": 0.855640172958374, + "num_tokens": 47768996.0, + "step": 1309 + }, + { + "epoch": 0.24326833797585887, + "grad_norm": 1.560637354850769, + "learning_rate": 8.100247524752475e-07, + "loss": 0.4656, + "mean_token_accuracy": 0.8479143381118774, + "num_tokens": 47805205.0, + "step": 1310 + }, + { + "epoch": 0.2434540389972145, + "grad_norm": 1.492758870124817, + "learning_rate": 8.106435643564357e-07, + "loss": 0.4266, + "mean_token_accuracy": 0.8583084344863892, + "num_tokens": 47843909.0, + "step": 1311 + }, + { + "epoch": 0.2436397400185701, + "grad_norm": 1.5594267845153809, + "learning_rate": 8.112623762376237e-07, + "loss": 0.4199, + "mean_token_accuracy": 0.8619476556777954, + "num_tokens": 47884067.0, + "step": 1312 + }, + { + "epoch": 0.24382544103992573, + "grad_norm": 1.475476622581482, + "learning_rate": 8.118811881188119e-07, + "loss": 0.4539, + "mean_token_accuracy": 0.8510228395462036, + "num_tokens": 47925363.0, + "step": 1313 + }, + { + "epoch": 0.24401114206128133, + "grad_norm": 1.5863111019134521, + "learning_rate": 8.125e-07, + "loss": 0.399, + "mean_token_accuracy": 0.8622640371322632, + "num_tokens": 47960833.0, + "step": 1314 + }, + { + "epoch": 0.24419684308263695, + "grad_norm": 1.6821439266204834, + "learning_rate": 8.13118811881188e-07, + "loss": 0.4454, + "mean_token_accuracy": 0.8542078733444214, + "num_tokens": 47995948.0, + "step": 1315 + }, + { + "epoch": 0.24438254410399257, + "grad_norm": 1.758083462715149, + "learning_rate": 8.137376237623762e-07, + "loss": 0.3947, + "mean_token_accuracy": 0.8686693906784058, + "num_tokens": 48025911.0, + "step": 1316 + }, + { + "epoch": 0.2445682451253482, + "grad_norm": 1.7385812997817993, + "learning_rate": 8.143564356435642e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8484978675842285, + "num_tokens": 48056992.0, + "step": 1317 + }, + { + "epoch": 0.24475394614670382, + "grad_norm": 1.9741363525390625, + "learning_rate": 8.149752475247524e-07, + "loss": 0.4735, + "mean_token_accuracy": 0.8437051773071289, + "num_tokens": 48083937.0, + "step": 1318 + }, + { + "epoch": 0.24493964716805944, + "grad_norm": 1.5975474119186401, + "learning_rate": 8.155940594059405e-07, + "loss": 0.4145, + "mean_token_accuracy": 0.8601435422897339, + "num_tokens": 48121110.0, + "step": 1319 + }, + { + "epoch": 0.24512534818941503, + "grad_norm": 1.5846267938613892, + "learning_rate": 8.162128712871287e-07, + "loss": 0.3599, + "mean_token_accuracy": 0.8765318393707275, + "num_tokens": 48150451.0, + "step": 1320 + }, + { + "epoch": 0.24531104921077065, + "grad_norm": 1.5401787757873535, + "learning_rate": 8.168316831683168e-07, + "loss": 0.4416, + "mean_token_accuracy": 0.8525941967964172, + "num_tokens": 48187716.0, + "step": 1321 + }, + { + "epoch": 0.24549675023212628, + "grad_norm": 1.529591679573059, + "learning_rate": 8.174504950495049e-07, + "loss": 0.4243, + "mean_token_accuracy": 0.8576046228408813, + "num_tokens": 48227081.0, + "step": 1322 + }, + { + "epoch": 0.2456824512534819, + "grad_norm": 1.4567936658859253, + "learning_rate": 8.18069306930693e-07, + "loss": 0.407, + "mean_token_accuracy": 0.8604289293289185, + "num_tokens": 48265727.0, + "step": 1323 + }, + { + "epoch": 0.24586815227483752, + "grad_norm": 1.578122854232788, + "learning_rate": 8.186881188118812e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8428177237510681, + "num_tokens": 48304730.0, + "step": 1324 + }, + { + "epoch": 0.24605385329619314, + "grad_norm": 1.7248469591140747, + "learning_rate": 8.193069306930692e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.845350980758667, + "num_tokens": 48337556.0, + "step": 1325 + }, + { + "epoch": 0.24623955431754874, + "grad_norm": 1.52671217918396, + "learning_rate": 8.199257425742574e-07, + "loss": 0.408, + "mean_token_accuracy": 0.8595981001853943, + "num_tokens": 48376629.0, + "step": 1326 + }, + { + "epoch": 0.24642525533890436, + "grad_norm": 1.5511305332183838, + "learning_rate": 8.205445544554455e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.848114013671875, + "num_tokens": 48413786.0, + "step": 1327 + }, + { + "epoch": 0.24661095636025998, + "grad_norm": 1.5218605995178223, + "learning_rate": 8.211633663366336e-07, + "loss": 0.362, + "mean_token_accuracy": 0.8768779635429382, + "num_tokens": 48448213.0, + "step": 1328 + }, + { + "epoch": 0.2467966573816156, + "grad_norm": 1.528771996498108, + "learning_rate": 8.217821782178217e-07, + "loss": 0.4286, + "mean_token_accuracy": 0.855021595954895, + "num_tokens": 48486153.0, + "step": 1329 + }, + { + "epoch": 0.24698235840297122, + "grad_norm": 1.5609239339828491, + "learning_rate": 8.224009900990098e-07, + "loss": 0.4314, + "mean_token_accuracy": 0.8565114736557007, + "num_tokens": 48524095.0, + "step": 1330 + }, + { + "epoch": 0.24716805942432685, + "grad_norm": 1.6438202857971191, + "learning_rate": 8.230198019801979e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.8412083387374878, + "num_tokens": 48563548.0, + "step": 1331 + }, + { + "epoch": 0.24735376044568244, + "grad_norm": 1.5586292743682861, + "learning_rate": 8.236386138613861e-07, + "loss": 0.4002, + "mean_token_accuracy": 0.8612982630729675, + "num_tokens": 48600935.0, + "step": 1332 + }, + { + "epoch": 0.24753946146703806, + "grad_norm": 1.6859983205795288, + "learning_rate": 8.242574257425742e-07, + "loss": 0.479, + "mean_token_accuracy": 0.841913104057312, + "num_tokens": 48638697.0, + "step": 1333 + }, + { + "epoch": 0.24772516248839369, + "grad_norm": 1.700461745262146, + "learning_rate": 8.248762376237624e-07, + "loss": 0.4459, + "mean_token_accuracy": 0.8477252721786499, + "num_tokens": 48669950.0, + "step": 1334 + }, + { + "epoch": 0.2479108635097493, + "grad_norm": 1.72792387008667, + "learning_rate": 8.254950495049504e-07, + "loss": 0.4171, + "mean_token_accuracy": 0.8601878881454468, + "num_tokens": 48700396.0, + "step": 1335 + }, + { + "epoch": 0.24809656453110493, + "grad_norm": 1.6497979164123535, + "learning_rate": 8.261138613861386e-07, + "loss": 0.4249, + "mean_token_accuracy": 0.86159747838974, + "num_tokens": 48735528.0, + "step": 1336 + }, + { + "epoch": 0.24828226555246055, + "grad_norm": 1.5895613431930542, + "learning_rate": 8.267326732673267e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8572161793708801, + "num_tokens": 48774285.0, + "step": 1337 + }, + { + "epoch": 0.24846796657381615, + "grad_norm": 1.4887138605117798, + "learning_rate": 8.273514851485148e-07, + "loss": 0.3938, + "mean_token_accuracy": 0.8646809458732605, + "num_tokens": 48810001.0, + "step": 1338 + }, + { + "epoch": 0.24865366759517177, + "grad_norm": 1.740491271018982, + "learning_rate": 8.279702970297029e-07, + "loss": 0.4315, + "mean_token_accuracy": 0.8593274354934692, + "num_tokens": 48842365.0, + "step": 1339 + }, + { + "epoch": 0.2488393686165274, + "grad_norm": 2.2227139472961426, + "learning_rate": 8.285891089108911e-07, + "loss": 0.419, + "mean_token_accuracy": 0.860312819480896, + "num_tokens": 48877033.0, + "step": 1340 + }, + { + "epoch": 0.249025069637883, + "grad_norm": 1.5534486770629883, + "learning_rate": 8.292079207920791e-07, + "loss": 0.3764, + "mean_token_accuracy": 0.8712211847305298, + "num_tokens": 48914224.0, + "step": 1341 + }, + { + "epoch": 0.24921077065923863, + "grad_norm": 1.6389607191085815, + "learning_rate": 8.298267326732673e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.8500980734825134, + "num_tokens": 48952016.0, + "step": 1342 + }, + { + "epoch": 0.24939647168059426, + "grad_norm": 1.6236923933029175, + "learning_rate": 8.304455445544553e-07, + "loss": 0.4022, + "mean_token_accuracy": 0.8653227090835571, + "num_tokens": 48986283.0, + "step": 1343 + }, + { + "epoch": 0.24958217270194985, + "grad_norm": 1.6301898956298828, + "learning_rate": 8.310643564356435e-07, + "loss": 0.452, + "mean_token_accuracy": 0.8494899272918701, + "num_tokens": 49020608.0, + "step": 1344 + }, + { + "epoch": 0.24976787372330547, + "grad_norm": 1.5218396186828613, + "learning_rate": 8.316831683168316e-07, + "loss": 0.4327, + "mean_token_accuracy": 0.8556980490684509, + "num_tokens": 49058836.0, + "step": 1345 + }, + { + "epoch": 0.2499535747446611, + "grad_norm": 1.5420314073562622, + "learning_rate": 8.323019801980198e-07, + "loss": 0.4351, + "mean_token_accuracy": 0.8577769994735718, + "num_tokens": 49100225.0, + "step": 1346 + }, + { + "epoch": 0.2501392757660167, + "grad_norm": 1.7501839399337769, + "learning_rate": 8.329207920792079e-07, + "loss": 0.4333, + "mean_token_accuracy": 0.8531010150909424, + "num_tokens": 49139130.0, + "step": 1347 + }, + { + "epoch": 0.2503249767873723, + "grad_norm": 1.6430583000183105, + "learning_rate": 8.33539603960396e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.8467631340026855, + "num_tokens": 49174800.0, + "step": 1348 + }, + { + "epoch": 0.25051067780872793, + "grad_norm": 1.6979193687438965, + "learning_rate": 8.341584158415841e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.8475397229194641, + "num_tokens": 49205963.0, + "step": 1349 + }, + { + "epoch": 0.25069637883008355, + "grad_norm": 1.6622930765151978, + "learning_rate": 8.347772277227723e-07, + "loss": 0.4598, + "mean_token_accuracy": 0.848641037940979, + "num_tokens": 49238556.0, + "step": 1350 + }, + { + "epoch": 0.2508820798514392, + "grad_norm": 1.5443806648254395, + "learning_rate": 8.353960396039603e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.8486315608024597, + "num_tokens": 49279895.0, + "step": 1351 + }, + { + "epoch": 0.2510677808727948, + "grad_norm": 1.7265182733535767, + "learning_rate": 8.360148514851485e-07, + "loss": 0.4481, + "mean_token_accuracy": 0.8511555194854736, + "num_tokens": 49312893.0, + "step": 1352 + }, + { + "epoch": 0.2512534818941504, + "grad_norm": 1.7170329093933105, + "learning_rate": 8.366336633663366e-07, + "loss": 0.4558, + "mean_token_accuracy": 0.8522385358810425, + "num_tokens": 49349400.0, + "step": 1353 + }, + { + "epoch": 0.25143918291550604, + "grad_norm": 1.4951162338256836, + "learning_rate": 8.372524752475247e-07, + "loss": 0.377, + "mean_token_accuracy": 0.8709671497344971, + "num_tokens": 49385158.0, + "step": 1354 + }, + { + "epoch": 0.25162488393686167, + "grad_norm": 1.496859073638916, + "learning_rate": 8.378712871287128e-07, + "loss": 0.447, + "mean_token_accuracy": 0.848982572555542, + "num_tokens": 49427685.0, + "step": 1355 + }, + { + "epoch": 0.2518105849582173, + "grad_norm": 1.692867636680603, + "learning_rate": 8.384900990099009e-07, + "loss": 0.4311, + "mean_token_accuracy": 0.8570353984832764, + "num_tokens": 49461287.0, + "step": 1356 + }, + { + "epoch": 0.2519962859795729, + "grad_norm": 1.4059737920761108, + "learning_rate": 8.39108910891089e-07, + "loss": 0.393, + "mean_token_accuracy": 0.8655477166175842, + "num_tokens": 49502462.0, + "step": 1357 + }, + { + "epoch": 0.25218198700092853, + "grad_norm": 1.73572838306427, + "learning_rate": 8.397277227722772e-07, + "loss": 0.4119, + "mean_token_accuracy": 0.8607701063156128, + "num_tokens": 49532774.0, + "step": 1358 + }, + { + "epoch": 0.2523676880222841, + "grad_norm": 1.5882705450057983, + "learning_rate": 8.403465346534653e-07, + "loss": 0.4357, + "mean_token_accuracy": 0.8504601120948792, + "num_tokens": 49569122.0, + "step": 1359 + }, + { + "epoch": 0.2525533890436397, + "grad_norm": 1.5698966979980469, + "learning_rate": 8.409653465346535e-07, + "loss": 0.4512, + "mean_token_accuracy": 0.8499926328659058, + "num_tokens": 49606384.0, + "step": 1360 + }, + { + "epoch": 0.25273909006499534, + "grad_norm": 1.5761067867279053, + "learning_rate": 8.415841584158416e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8531187772750854, + "num_tokens": 49644239.0, + "step": 1361 + }, + { + "epoch": 0.25292479108635096, + "grad_norm": 1.7352968454360962, + "learning_rate": 8.422029702970297e-07, + "loss": 0.4584, + "mean_token_accuracy": 0.8464975357055664, + "num_tokens": 49676987.0, + "step": 1362 + }, + { + "epoch": 0.2531104921077066, + "grad_norm": 1.5027785301208496, + "learning_rate": 8.428217821782178e-07, + "loss": 0.4443, + "mean_token_accuracy": 0.8520644903182983, + "num_tokens": 49714731.0, + "step": 1363 + }, + { + "epoch": 0.2532961931290622, + "grad_norm": 1.4848121404647827, + "learning_rate": 8.434405940594059e-07, + "loss": 0.4408, + "mean_token_accuracy": 0.851230263710022, + "num_tokens": 49755828.0, + "step": 1364 + }, + { + "epoch": 0.25348189415041783, + "grad_norm": 1.4223154783248901, + "learning_rate": 8.44059405940594e-07, + "loss": 0.4093, + "mean_token_accuracy": 0.8603954315185547, + "num_tokens": 49796895.0, + "step": 1365 + }, + { + "epoch": 0.25366759517177345, + "grad_norm": 1.6616696119308472, + "learning_rate": 8.446782178217822e-07, + "loss": 0.4639, + "mean_token_accuracy": 0.8433430194854736, + "num_tokens": 49829772.0, + "step": 1366 + }, + { + "epoch": 0.2538532961931291, + "grad_norm": 1.5180819034576416, + "learning_rate": 8.452970297029702e-07, + "loss": 0.3936, + "mean_token_accuracy": 0.8691152930259705, + "num_tokens": 49864799.0, + "step": 1367 + }, + { + "epoch": 0.2540389972144847, + "grad_norm": 1.5978649854660034, + "learning_rate": 8.459158415841584e-07, + "loss": 0.4177, + "mean_token_accuracy": 0.8574252128601074, + "num_tokens": 49900323.0, + "step": 1368 + }, + { + "epoch": 0.2542246982358403, + "grad_norm": 1.5019209384918213, + "learning_rate": 8.465346534653464e-07, + "loss": 0.4066, + "mean_token_accuracy": 0.8636977672576904, + "num_tokens": 49937881.0, + "step": 1369 + }, + { + "epoch": 0.25441039925719594, + "grad_norm": 1.4436320066452026, + "learning_rate": 8.471534653465346e-07, + "loss": 0.3693, + "mean_token_accuracy": 0.8750808835029602, + "num_tokens": 49978627.0, + "step": 1370 + }, + { + "epoch": 0.2545961002785515, + "grad_norm": 1.676856279373169, + "learning_rate": 8.477722772277227e-07, + "loss": 0.4295, + "mean_token_accuracy": 0.8539685010910034, + "num_tokens": 50011692.0, + "step": 1371 + }, + { + "epoch": 0.25478180129990713, + "grad_norm": 1.5993773937225342, + "learning_rate": 8.483910891089109e-07, + "loss": 0.4186, + "mean_token_accuracy": 0.8583793044090271, + "num_tokens": 50045022.0, + "step": 1372 + }, + { + "epoch": 0.25496750232126275, + "grad_norm": 1.6576414108276367, + "learning_rate": 8.49009900990099e-07, + "loss": 0.4604, + "mean_token_accuracy": 0.8480189442634583, + "num_tokens": 50078043.0, + "step": 1373 + }, + { + "epoch": 0.2551532033426184, + "grad_norm": 1.546169638633728, + "learning_rate": 8.496287128712872e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.8360391855239868, + "num_tokens": 50118368.0, + "step": 1374 + }, + { + "epoch": 0.255338904363974, + "grad_norm": 1.5683832168579102, + "learning_rate": 8.502475247524752e-07, + "loss": 0.411, + "mean_token_accuracy": 0.86374431848526, + "num_tokens": 50153194.0, + "step": 1375 + }, + { + "epoch": 0.2555246053853296, + "grad_norm": 1.614652156829834, + "learning_rate": 8.508663366336634e-07, + "loss": 0.3957, + "mean_token_accuracy": 0.8649320006370544, + "num_tokens": 50185149.0, + "step": 1376 + }, + { + "epoch": 0.25571030640668524, + "grad_norm": 1.5665627717971802, + "learning_rate": 8.514851485148514e-07, + "loss": 0.4315, + "mean_token_accuracy": 0.8569232821464539, + "num_tokens": 50222042.0, + "step": 1377 + }, + { + "epoch": 0.25589600742804086, + "grad_norm": 1.5743869543075562, + "learning_rate": 8.521039603960396e-07, + "loss": 0.4271, + "mean_token_accuracy": 0.8568226099014282, + "num_tokens": 50257086.0, + "step": 1378 + }, + { + "epoch": 0.2560817084493965, + "grad_norm": 1.5789387226104736, + "learning_rate": 8.527227722772277e-07, + "loss": 0.4674, + "mean_token_accuracy": 0.8482478857040405, + "num_tokens": 50290443.0, + "step": 1379 + }, + { + "epoch": 0.2562674094707521, + "grad_norm": 1.6539350748062134, + "learning_rate": 8.533415841584158e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.8386718034744263, + "num_tokens": 50326331.0, + "step": 1380 + }, + { + "epoch": 0.2564531104921077, + "grad_norm": 1.6067421436309814, + "learning_rate": 8.539603960396039e-07, + "loss": 0.4125, + "mean_token_accuracy": 0.8639679551124573, + "num_tokens": 50363439.0, + "step": 1381 + }, + { + "epoch": 0.25663881151346335, + "grad_norm": 1.4977715015411377, + "learning_rate": 8.54579207920792e-07, + "loss": 0.422, + "mean_token_accuracy": 0.8588752746582031, + "num_tokens": 50403210.0, + "step": 1382 + }, + { + "epoch": 0.2568245125348189, + "grad_norm": 1.8000407218933105, + "learning_rate": 8.551980198019801e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8398136496543884, + "num_tokens": 50436325.0, + "step": 1383 + }, + { + "epoch": 0.25701021355617454, + "grad_norm": 1.608384609222412, + "learning_rate": 8.558168316831683e-07, + "loss": 0.419, + "mean_token_accuracy": 0.8604279160499573, + "num_tokens": 50474332.0, + "step": 1384 + }, + { + "epoch": 0.25719591457753016, + "grad_norm": 1.6944692134857178, + "learning_rate": 8.564356435643563e-07, + "loss": 0.412, + "mean_token_accuracy": 0.8653052449226379, + "num_tokens": 50507811.0, + "step": 1385 + }, + { + "epoch": 0.2573816155988858, + "grad_norm": 1.7332706451416016, + "learning_rate": 8.570544554455446e-07, + "loss": 0.4442, + "mean_token_accuracy": 0.8523371815681458, + "num_tokens": 50539578.0, + "step": 1386 + }, + { + "epoch": 0.2575673166202414, + "grad_norm": 1.5970979928970337, + "learning_rate": 8.576732673267327e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8490327596664429, + "num_tokens": 50576769.0, + "step": 1387 + }, + { + "epoch": 0.257753017641597, + "grad_norm": 1.6942436695098877, + "learning_rate": 8.582920792079208e-07, + "loss": 0.4275, + "mean_token_accuracy": 0.8565186262130737, + "num_tokens": 50609013.0, + "step": 1388 + }, + { + "epoch": 0.25793871866295265, + "grad_norm": 1.6625317335128784, + "learning_rate": 8.589108910891089e-07, + "loss": 0.4548, + "mean_token_accuracy": 0.845511794090271, + "num_tokens": 50644044.0, + "step": 1389 + }, + { + "epoch": 0.25812441968430827, + "grad_norm": 1.627289891242981, + "learning_rate": 8.59529702970297e-07, + "loss": 0.4417, + "mean_token_accuracy": 0.8512254953384399, + "num_tokens": 50679386.0, + "step": 1390 + }, + { + "epoch": 0.2583101207056639, + "grad_norm": 1.5475488901138306, + "learning_rate": 8.601485148514851e-07, + "loss": 0.4058, + "mean_token_accuracy": 0.8648808002471924, + "num_tokens": 50713744.0, + "step": 1391 + }, + { + "epoch": 0.2584958217270195, + "grad_norm": 1.5964009761810303, + "learning_rate": 8.607673267326733e-07, + "loss": 0.399, + "mean_token_accuracy": 0.8666311502456665, + "num_tokens": 50751224.0, + "step": 1392 + }, + { + "epoch": 0.25868152274837514, + "grad_norm": 1.522673487663269, + "learning_rate": 8.613861386138613e-07, + "loss": 0.4444, + "mean_token_accuracy": 0.8485383987426758, + "num_tokens": 50791078.0, + "step": 1393 + }, + { + "epoch": 0.25886722376973076, + "grad_norm": 1.5478638410568237, + "learning_rate": 8.620049504950495e-07, + "loss": 0.43, + "mean_token_accuracy": 0.8527531623840332, + "num_tokens": 50828573.0, + "step": 1394 + }, + { + "epoch": 0.2590529247910863, + "grad_norm": 1.6780508756637573, + "learning_rate": 8.626237623762375e-07, + "loss": 0.4266, + "mean_token_accuracy": 0.8554372191429138, + "num_tokens": 50861366.0, + "step": 1395 + }, + { + "epoch": 0.25923862581244195, + "grad_norm": 1.6210957765579224, + "learning_rate": 8.632425742574257e-07, + "loss": 0.4217, + "mean_token_accuracy": 0.8586009740829468, + "num_tokens": 50896633.0, + "step": 1396 + }, + { + "epoch": 0.25942432683379757, + "grad_norm": 1.6884208917617798, + "learning_rate": 8.638613861386138e-07, + "loss": 0.4233, + "mean_token_accuracy": 0.8562040328979492, + "num_tokens": 50930083.0, + "step": 1397 + }, + { + "epoch": 0.2596100278551532, + "grad_norm": 1.525036334991455, + "learning_rate": 8.644801980198019e-07, + "loss": 0.4328, + "mean_token_accuracy": 0.8532981276512146, + "num_tokens": 50967295.0, + "step": 1398 + }, + { + "epoch": 0.2597957288765088, + "grad_norm": 1.5616956949234009, + "learning_rate": 8.650990099009901e-07, + "loss": 0.4674, + "mean_token_accuracy": 0.8369069695472717, + "num_tokens": 51006763.0, + "step": 1399 + }, + { + "epoch": 0.25998142989786444, + "grad_norm": 1.568126916885376, + "learning_rate": 8.657178217821783e-07, + "loss": 0.438, + "mean_token_accuracy": 0.8560361862182617, + "num_tokens": 51043265.0, + "step": 1400 + }, + { + "epoch": 0.26016713091922006, + "grad_norm": 1.6008883714675903, + "learning_rate": 8.663366336633663e-07, + "loss": 0.4338, + "mean_token_accuracy": 0.8555823564529419, + "num_tokens": 51078109.0, + "step": 1401 + }, + { + "epoch": 0.2603528319405757, + "grad_norm": 1.7917755842208862, + "learning_rate": 8.669554455445545e-07, + "loss": 0.4036, + "mean_token_accuracy": 0.8665608167648315, + "num_tokens": 51113649.0, + "step": 1402 + }, + { + "epoch": 0.2605385329619313, + "grad_norm": 1.623241901397705, + "learning_rate": 8.675742574257425e-07, + "loss": 0.4208, + "mean_token_accuracy": 0.855699896812439, + "num_tokens": 51149386.0, + "step": 1403 + }, + { + "epoch": 0.2607242339832869, + "grad_norm": 1.8323485851287842, + "learning_rate": 8.681930693069307e-07, + "loss": 0.416, + "mean_token_accuracy": 0.8594357967376709, + "num_tokens": 51180628.0, + "step": 1404 + }, + { + "epoch": 0.26090993500464255, + "grad_norm": 1.6090383529663086, + "learning_rate": 8.688118811881188e-07, + "loss": 0.3764, + "mean_token_accuracy": 0.8722911477088928, + "num_tokens": 51213659.0, + "step": 1405 + }, + { + "epoch": 0.26109563602599817, + "grad_norm": 1.5851584672927856, + "learning_rate": 8.694306930693069e-07, + "loss": 0.4525, + "mean_token_accuracy": 0.847625732421875, + "num_tokens": 51248729.0, + "step": 1406 + }, + { + "epoch": 0.26128133704735373, + "grad_norm": 1.5143529176712036, + "learning_rate": 8.70049504950495e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8458163142204285, + "num_tokens": 51287797.0, + "step": 1407 + }, + { + "epoch": 0.26146703806870936, + "grad_norm": 1.62647545337677, + "learning_rate": 8.706683168316832e-07, + "loss": 0.4362, + "mean_token_accuracy": 0.8523377180099487, + "num_tokens": 51324543.0, + "step": 1408 + }, + { + "epoch": 0.261652739090065, + "grad_norm": 1.547177791595459, + "learning_rate": 8.712871287128712e-07, + "loss": 0.4135, + "mean_token_accuracy": 0.861039400100708, + "num_tokens": 51363074.0, + "step": 1409 + }, + { + "epoch": 0.2618384401114206, + "grad_norm": 1.674726963043213, + "learning_rate": 8.719059405940594e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.852093517780304, + "num_tokens": 51398396.0, + "step": 1410 + }, + { + "epoch": 0.2620241411327762, + "grad_norm": 1.510411024093628, + "learning_rate": 8.725247524752474e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8571573495864868, + "num_tokens": 51436173.0, + "step": 1411 + }, + { + "epoch": 0.26220984215413184, + "grad_norm": 1.5442453622817993, + "learning_rate": 8.731435643564357e-07, + "loss": 0.4162, + "mean_token_accuracy": 0.8582282066345215, + "num_tokens": 51471180.0, + "step": 1412 + }, + { + "epoch": 0.26239554317548747, + "grad_norm": 1.633697509765625, + "learning_rate": 8.737623762376238e-07, + "loss": 0.4681, + "mean_token_accuracy": 0.8451202511787415, + "num_tokens": 51507993.0, + "step": 1413 + }, + { + "epoch": 0.2625812441968431, + "grad_norm": 1.5908843278884888, + "learning_rate": 8.743811881188119e-07, + "loss": 0.4557, + "mean_token_accuracy": 0.8477621674537659, + "num_tokens": 51542827.0, + "step": 1414 + }, + { + "epoch": 0.2627669452181987, + "grad_norm": 1.557639718055725, + "learning_rate": 8.75e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8338846564292908, + "num_tokens": 51585523.0, + "step": 1415 + }, + { + "epoch": 0.26295264623955433, + "grad_norm": 1.727674961090088, + "learning_rate": 8.75618811881188e-07, + "loss": 0.3896, + "mean_token_accuracy": 0.8680363297462463, + "num_tokens": 51615441.0, + "step": 1416 + }, + { + "epoch": 0.26313834726090995, + "grad_norm": 1.6919080018997192, + "learning_rate": 8.762376237623762e-07, + "loss": 0.4363, + "mean_token_accuracy": 0.850700855255127, + "num_tokens": 51651192.0, + "step": 1417 + }, + { + "epoch": 0.2633240482822656, + "grad_norm": 1.5902349948883057, + "learning_rate": 8.768564356435643e-07, + "loss": 0.4279, + "mean_token_accuracy": 0.8554903268814087, + "num_tokens": 51690786.0, + "step": 1418 + }, + { + "epoch": 0.26350974930362114, + "grad_norm": 1.5803173780441284, + "learning_rate": 8.774752475247524e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8555134534835815, + "num_tokens": 51725715.0, + "step": 1419 + }, + { + "epoch": 0.26369545032497677, + "grad_norm": 1.776291847229004, + "learning_rate": 8.780940594059405e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8394747376441956, + "num_tokens": 51759355.0, + "step": 1420 + }, + { + "epoch": 0.2638811513463324, + "grad_norm": 1.4368644952774048, + "learning_rate": 8.787128712871287e-07, + "loss": 0.4377, + "mean_token_accuracy": 0.8534578680992126, + "num_tokens": 51807325.0, + "step": 1421 + }, + { + "epoch": 0.264066852367688, + "grad_norm": 1.6916674375534058, + "learning_rate": 8.793316831683167e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.852713942527771, + "num_tokens": 51841388.0, + "step": 1422 + }, + { + "epoch": 0.26425255338904363, + "grad_norm": 1.5847855806350708, + "learning_rate": 8.799504950495049e-07, + "loss": 0.4088, + "mean_token_accuracy": 0.8586974143981934, + "num_tokens": 51875233.0, + "step": 1423 + }, + { + "epoch": 0.26443825441039925, + "grad_norm": 1.721291184425354, + "learning_rate": 8.805693069306929e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8296204209327698, + "num_tokens": 51912725.0, + "step": 1424 + }, + { + "epoch": 0.2646239554317549, + "grad_norm": 1.5573651790618896, + "learning_rate": 8.811881188118812e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.8516654968261719, + "num_tokens": 51950635.0, + "step": 1425 + }, + { + "epoch": 0.2648096564531105, + "grad_norm": 1.450399398803711, + "learning_rate": 8.818069306930693e-07, + "loss": 0.3833, + "mean_token_accuracy": 0.8703231811523438, + "num_tokens": 51991838.0, + "step": 1426 + }, + { + "epoch": 0.2649953574744661, + "grad_norm": 1.4766299724578857, + "learning_rate": 8.824257425742574e-07, + "loss": 0.4067, + "mean_token_accuracy": 0.8619920611381531, + "num_tokens": 52036452.0, + "step": 1427 + }, + { + "epoch": 0.26518105849582174, + "grad_norm": 1.6625124216079712, + "learning_rate": 8.830445544554455e-07, + "loss": 0.4621, + "mean_token_accuracy": 0.8501671552658081, + "num_tokens": 52071177.0, + "step": 1428 + }, + { + "epoch": 0.26536675951717736, + "grad_norm": 1.5251535177230835, + "learning_rate": 8.836633663366337e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8414909243583679, + "num_tokens": 52112344.0, + "step": 1429 + }, + { + "epoch": 0.265552460538533, + "grad_norm": 1.4761325120925903, + "learning_rate": 8.842821782178217e-07, + "loss": 0.3755, + "mean_token_accuracy": 0.8711363077163696, + "num_tokens": 52148901.0, + "step": 1430 + }, + { + "epoch": 0.26573816155988855, + "grad_norm": 1.6012108325958252, + "learning_rate": 8.849009900990099e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8475784659385681, + "num_tokens": 52187541.0, + "step": 1431 + }, + { + "epoch": 0.2659238625812442, + "grad_norm": 1.4984967708587646, + "learning_rate": 8.855198019801979e-07, + "loss": 0.4292, + "mean_token_accuracy": 0.8562496900558472, + "num_tokens": 52225413.0, + "step": 1432 + }, + { + "epoch": 0.2661095636025998, + "grad_norm": 1.5004431009292603, + "learning_rate": 8.861386138613861e-07, + "loss": 0.469, + "mean_token_accuracy": 0.8461008667945862, + "num_tokens": 52268342.0, + "step": 1433 + }, + { + "epoch": 0.2662952646239554, + "grad_norm": 1.5211517810821533, + "learning_rate": 8.867574257425742e-07, + "loss": 0.4157, + "mean_token_accuracy": 0.8609880208969116, + "num_tokens": 52306082.0, + "step": 1434 + }, + { + "epoch": 0.26648096564531104, + "grad_norm": 1.5577049255371094, + "learning_rate": 8.873762376237623e-07, + "loss": 0.4679, + "mean_token_accuracy": 0.8457972407341003, + "num_tokens": 52345308.0, + "step": 1435 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.5568840503692627, + "learning_rate": 8.879950495049504e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.849211573600769, + "num_tokens": 52386673.0, + "step": 1436 + }, + { + "epoch": 0.2668523676880223, + "grad_norm": 1.6944512128829956, + "learning_rate": 8.886138613861385e-07, + "loss": 0.4447, + "mean_token_accuracy": 0.854167103767395, + "num_tokens": 52421790.0, + "step": 1437 + }, + { + "epoch": 0.2670380687093779, + "grad_norm": 1.80433189868927, + "learning_rate": 8.892326732673266e-07, + "loss": 0.4699, + "mean_token_accuracy": 0.8435534834861755, + "num_tokens": 52453617.0, + "step": 1438 + }, + { + "epoch": 0.26722376973073353, + "grad_norm": 1.581249713897705, + "learning_rate": 8.898514851485149e-07, + "loss": 0.4096, + "mean_token_accuracy": 0.8649756908416748, + "num_tokens": 52488382.0, + "step": 1439 + }, + { + "epoch": 0.26740947075208915, + "grad_norm": 1.5940080881118774, + "learning_rate": 8.904702970297029e-07, + "loss": 0.3926, + "mean_token_accuracy": 0.861639142036438, + "num_tokens": 52521412.0, + "step": 1440 + }, + { + "epoch": 0.2675951717734448, + "grad_norm": 1.4906014204025269, + "learning_rate": 8.910891089108911e-07, + "loss": 0.4439, + "mean_token_accuracy": 0.8510125875473022, + "num_tokens": 52563160.0, + "step": 1441 + }, + { + "epoch": 0.2677808727948004, + "grad_norm": 1.5141949653625488, + "learning_rate": 8.917079207920792e-07, + "loss": 0.4603, + "mean_token_accuracy": 0.8482558727264404, + "num_tokens": 52605766.0, + "step": 1442 + }, + { + "epoch": 0.26796657381615596, + "grad_norm": 1.5162806510925293, + "learning_rate": 8.923267326732673e-07, + "loss": 0.418, + "mean_token_accuracy": 0.8567756414413452, + "num_tokens": 52643666.0, + "step": 1443 + }, + { + "epoch": 0.2681522748375116, + "grad_norm": 1.6023648977279663, + "learning_rate": 8.929455445544554e-07, + "loss": 0.4179, + "mean_token_accuracy": 0.8587281107902527, + "num_tokens": 52681378.0, + "step": 1444 + }, + { + "epoch": 0.2683379758588672, + "grad_norm": 1.5431562662124634, + "learning_rate": 8.935643564356435e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8457801342010498, + "num_tokens": 52720865.0, + "step": 1445 + }, + { + "epoch": 0.26852367688022283, + "grad_norm": 1.698068618774414, + "learning_rate": 8.941831683168316e-07, + "loss": 0.4206, + "mean_token_accuracy": 0.8574275970458984, + "num_tokens": 52753507.0, + "step": 1446 + }, + { + "epoch": 0.26870937790157845, + "grad_norm": 1.7849212884902954, + "learning_rate": 8.948019801980198e-07, + "loss": 0.4439, + "mean_token_accuracy": 0.8518151640892029, + "num_tokens": 52786148.0, + "step": 1447 + }, + { + "epoch": 0.26889507892293407, + "grad_norm": 1.5325987339019775, + "learning_rate": 8.954207920792078e-07, + "loss": 0.3819, + "mean_token_accuracy": 0.8692599534988403, + "num_tokens": 52825144.0, + "step": 1448 + }, + { + "epoch": 0.2690807799442897, + "grad_norm": 1.4016051292419434, + "learning_rate": 8.96039603960396e-07, + "loss": 0.3727, + "mean_token_accuracy": 0.8763384222984314, + "num_tokens": 52866292.0, + "step": 1449 + }, + { + "epoch": 0.2692664809656453, + "grad_norm": 1.6044526100158691, + "learning_rate": 8.96658415841584e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8486161231994629, + "num_tokens": 52905297.0, + "step": 1450 + }, + { + "epoch": 0.26945218198700094, + "grad_norm": 1.7397898435592651, + "learning_rate": 8.972772277227722e-07, + "loss": 0.3892, + "mean_token_accuracy": 0.8655217885971069, + "num_tokens": 52934480.0, + "step": 1451 + }, + { + "epoch": 0.26963788300835656, + "grad_norm": 1.7234249114990234, + "learning_rate": 8.978960396039604e-07, + "loss": 0.4199, + "mean_token_accuracy": 0.8643524646759033, + "num_tokens": 52965904.0, + "step": 1452 + }, + { + "epoch": 0.2698235840297122, + "grad_norm": 1.6748510599136353, + "learning_rate": 8.985148514851485e-07, + "loss": 0.4306, + "mean_token_accuracy": 0.858191728591919, + "num_tokens": 52998742.0, + "step": 1453 + }, + { + "epoch": 0.2700092850510678, + "grad_norm": 1.5580826997756958, + "learning_rate": 8.991336633663366e-07, + "loss": 0.4234, + "mean_token_accuracy": 0.858020544052124, + "num_tokens": 53036218.0, + "step": 1454 + }, + { + "epoch": 0.27019498607242337, + "grad_norm": 1.6034225225448608, + "learning_rate": 8.997524752475248e-07, + "loss": 0.4431, + "mean_token_accuracy": 0.854631781578064, + "num_tokens": 53072546.0, + "step": 1455 + }, + { + "epoch": 0.270380687093779, + "grad_norm": 1.6512962579727173, + "learning_rate": 9.003712871287128e-07, + "loss": 0.4192, + "mean_token_accuracy": 0.8563635349273682, + "num_tokens": 53105887.0, + "step": 1456 + }, + { + "epoch": 0.2705663881151346, + "grad_norm": 1.5987272262573242, + "learning_rate": 9.00990099009901e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8400353193283081, + "num_tokens": 53140276.0, + "step": 1457 + }, + { + "epoch": 0.27075208913649024, + "grad_norm": 1.5682605504989624, + "learning_rate": 9.01608910891089e-07, + "loss": 0.4299, + "mean_token_accuracy": 0.855077862739563, + "num_tokens": 53177397.0, + "step": 1458 + }, + { + "epoch": 0.27093779015784586, + "grad_norm": 1.4847042560577393, + "learning_rate": 9.022277227722772e-07, + "loss": 0.4402, + "mean_token_accuracy": 0.8525784015655518, + "num_tokens": 53215731.0, + "step": 1459 + }, + { + "epoch": 0.2711234911792015, + "grad_norm": 1.5963149070739746, + "learning_rate": 9.028465346534653e-07, + "loss": 0.3971, + "mean_token_accuracy": 0.8656208515167236, + "num_tokens": 53250048.0, + "step": 1460 + }, + { + "epoch": 0.2713091922005571, + "grad_norm": 1.6793524026870728, + "learning_rate": 9.034653465346534e-07, + "loss": 0.4584, + "mean_token_accuracy": 0.8525400161743164, + "num_tokens": 53285177.0, + "step": 1461 + }, + { + "epoch": 0.2714948932219127, + "grad_norm": 1.7017728090286255, + "learning_rate": 9.040841584158415e-07, + "loss": 0.4925, + "mean_token_accuracy": 0.8385767936706543, + "num_tokens": 53318595.0, + "step": 1462 + }, + { + "epoch": 0.27168059424326835, + "grad_norm": 1.611818790435791, + "learning_rate": 9.047029702970296e-07, + "loss": 0.4307, + "mean_token_accuracy": 0.8546348214149475, + "num_tokens": 53354397.0, + "step": 1463 + }, + { + "epoch": 0.27186629526462397, + "grad_norm": 1.6291546821594238, + "learning_rate": 9.053217821782177e-07, + "loss": 0.4224, + "mean_token_accuracy": 0.8560056090354919, + "num_tokens": 53388606.0, + "step": 1464 + }, + { + "epoch": 0.2720519962859796, + "grad_norm": 1.7180787324905396, + "learning_rate": 9.05940594059406e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8339478969573975, + "num_tokens": 53423691.0, + "step": 1465 + }, + { + "epoch": 0.2722376973073352, + "grad_norm": 1.630706787109375, + "learning_rate": 9.06559405940594e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8430142402648926, + "num_tokens": 53464102.0, + "step": 1466 + }, + { + "epoch": 0.2724233983286908, + "grad_norm": 1.5997439622879028, + "learning_rate": 9.071782178217822e-07, + "loss": 0.42, + "mean_token_accuracy": 0.8587722778320312, + "num_tokens": 53503228.0, + "step": 1467 + }, + { + "epoch": 0.2726090993500464, + "grad_norm": 1.5003688335418701, + "learning_rate": 9.077970297029703e-07, + "loss": 0.3875, + "mean_token_accuracy": 0.8666880130767822, + "num_tokens": 53541402.0, + "step": 1468 + }, + { + "epoch": 0.272794800371402, + "grad_norm": 1.7299476861953735, + "learning_rate": 9.084158415841584e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8382163047790527, + "num_tokens": 53573055.0, + "step": 1469 + }, + { + "epoch": 0.27298050139275765, + "grad_norm": 1.5457146167755127, + "learning_rate": 9.090346534653465e-07, + "loss": 0.3744, + "mean_token_accuracy": 0.8746737241744995, + "num_tokens": 53606536.0, + "step": 1470 + }, + { + "epoch": 0.27316620241411327, + "grad_norm": 1.658724069595337, + "learning_rate": 9.096534653465346e-07, + "loss": 0.4208, + "mean_token_accuracy": 0.8582643866539001, + "num_tokens": 53644415.0, + "step": 1471 + }, + { + "epoch": 0.2733519034354689, + "grad_norm": 1.6065266132354736, + "learning_rate": 9.102722772277227e-07, + "loss": 0.4145, + "mean_token_accuracy": 0.8594632148742676, + "num_tokens": 53680583.0, + "step": 1472 + }, + { + "epoch": 0.2735376044568245, + "grad_norm": 1.6855829954147339, + "learning_rate": 9.108910891089109e-07, + "loss": 0.4147, + "mean_token_accuracy": 0.8581317663192749, + "num_tokens": 53711749.0, + "step": 1473 + }, + { + "epoch": 0.27372330547818013, + "grad_norm": 1.6895012855529785, + "learning_rate": 9.115099009900989e-07, + "loss": 0.4262, + "mean_token_accuracy": 0.8570191264152527, + "num_tokens": 53746039.0, + "step": 1474 + }, + { + "epoch": 0.27390900649953576, + "grad_norm": 1.566157579421997, + "learning_rate": 9.121287128712871e-07, + "loss": 0.4119, + "mean_token_accuracy": 0.8638818264007568, + "num_tokens": 53779912.0, + "step": 1475 + }, + { + "epoch": 0.2740947075208914, + "grad_norm": 1.5696897506713867, + "learning_rate": 9.127475247524751e-07, + "loss": 0.383, + "mean_token_accuracy": 0.8726332783699036, + "num_tokens": 53817193.0, + "step": 1476 + }, + { + "epoch": 0.274280408542247, + "grad_norm": 1.7165493965148926, + "learning_rate": 9.133663366336633e-07, + "loss": 0.4174, + "mean_token_accuracy": 0.8575838208198547, + "num_tokens": 53846294.0, + "step": 1477 + }, + { + "epoch": 0.2744661095636026, + "grad_norm": 1.767100214958191, + "learning_rate": 9.139851485148514e-07, + "loss": 0.4292, + "mean_token_accuracy": 0.8577563762664795, + "num_tokens": 53875916.0, + "step": 1478 + }, + { + "epoch": 0.27465181058495824, + "grad_norm": 1.745151162147522, + "learning_rate": 9.146039603960396e-07, + "loss": 0.4492, + "mean_token_accuracy": 0.8488254547119141, + "num_tokens": 53907316.0, + "step": 1479 + }, + { + "epoch": 0.2748375116063138, + "grad_norm": 1.756557822227478, + "learning_rate": 9.152227722772277e-07, + "loss": 0.469, + "mean_token_accuracy": 0.8409306406974792, + "num_tokens": 53945156.0, + "step": 1480 + }, + { + "epoch": 0.27502321262766943, + "grad_norm": 1.6304421424865723, + "learning_rate": 9.158415841584159e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.8490175604820251, + "num_tokens": 53981914.0, + "step": 1481 + }, + { + "epoch": 0.27520891364902506, + "grad_norm": 1.5911978483200073, + "learning_rate": 9.164603960396039e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.8295217752456665, + "num_tokens": 54023304.0, + "step": 1482 + }, + { + "epoch": 0.2753946146703807, + "grad_norm": 1.6154056787490845, + "learning_rate": 9.170792079207921e-07, + "loss": 0.4654, + "mean_token_accuracy": 0.8475849628448486, + "num_tokens": 54060076.0, + "step": 1483 + }, + { + "epoch": 0.2755803156917363, + "grad_norm": 1.587768316268921, + "learning_rate": 9.176980198019801e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.8447300791740417, + "num_tokens": 54102410.0, + "step": 1484 + }, + { + "epoch": 0.2757660167130919, + "grad_norm": 1.4802321195602417, + "learning_rate": 9.183168316831683e-07, + "loss": 0.427, + "mean_token_accuracy": 0.8550057411193848, + "num_tokens": 54145209.0, + "step": 1485 + }, + { + "epoch": 0.27595171773444754, + "grad_norm": 1.6819912195205688, + "learning_rate": 9.189356435643564e-07, + "loss": 0.4056, + "mean_token_accuracy": 0.8640508651733398, + "num_tokens": 54182222.0, + "step": 1486 + }, + { + "epoch": 0.27613741875580317, + "grad_norm": 1.4930022954940796, + "learning_rate": 9.195544554455445e-07, + "loss": 0.422, + "mean_token_accuracy": 0.8606507778167725, + "num_tokens": 54225041.0, + "step": 1487 + }, + { + "epoch": 0.2763231197771588, + "grad_norm": 1.5240651369094849, + "learning_rate": 9.201732673267326e-07, + "loss": 0.408, + "mean_token_accuracy": 0.8581421375274658, + "num_tokens": 54267215.0, + "step": 1488 + }, + { + "epoch": 0.2765088207985144, + "grad_norm": 1.5225422382354736, + "learning_rate": 9.207920792079208e-07, + "loss": 0.4121, + "mean_token_accuracy": 0.860463559627533, + "num_tokens": 54307963.0, + "step": 1489 + }, + { + "epoch": 0.27669452181987003, + "grad_norm": 1.855689287185669, + "learning_rate": 9.214108910891088e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.8527685403823853, + "num_tokens": 54340928.0, + "step": 1490 + }, + { + "epoch": 0.27688022284122565, + "grad_norm": 1.952717661857605, + "learning_rate": 9.220297029702971e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.847786009311676, + "num_tokens": 54376925.0, + "step": 1491 + }, + { + "epoch": 0.2770659238625812, + "grad_norm": 1.7448023557662964, + "learning_rate": 9.226485148514851e-07, + "loss": 0.405, + "mean_token_accuracy": 0.8644581437110901, + "num_tokens": 54410437.0, + "step": 1492 + }, + { + "epoch": 0.27725162488393684, + "grad_norm": 1.5951427221298218, + "learning_rate": 9.232673267326733e-07, + "loss": 0.4043, + "mean_token_accuracy": 0.8644431829452515, + "num_tokens": 54447642.0, + "step": 1493 + }, + { + "epoch": 0.27743732590529246, + "grad_norm": 1.6683961153030396, + "learning_rate": 9.238861386138614e-07, + "loss": 0.3784, + "mean_token_accuracy": 0.8667295575141907, + "num_tokens": 54481350.0, + "step": 1494 + }, + { + "epoch": 0.2776230269266481, + "grad_norm": 1.5370104312896729, + "learning_rate": 9.245049504950495e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8435863256454468, + "num_tokens": 54524548.0, + "step": 1495 + }, + { + "epoch": 0.2778087279480037, + "grad_norm": 1.6737775802612305, + "learning_rate": 9.251237623762376e-07, + "loss": 0.4763, + "mean_token_accuracy": 0.8442898988723755, + "num_tokens": 54558549.0, + "step": 1496 + }, + { + "epoch": 0.27799442896935933, + "grad_norm": 1.6064640283584595, + "learning_rate": 9.257425742574257e-07, + "loss": 0.4152, + "mean_token_accuracy": 0.8618606328964233, + "num_tokens": 54595201.0, + "step": 1497 + }, + { + "epoch": 0.27818012999071495, + "grad_norm": 1.7841633558273315, + "learning_rate": 9.263613861386138e-07, + "loss": 0.4363, + "mean_token_accuracy": 0.8556554317474365, + "num_tokens": 54628111.0, + "step": 1498 + }, + { + "epoch": 0.2783658310120706, + "grad_norm": 1.5575270652770996, + "learning_rate": 9.26980198019802e-07, + "loss": 0.427, + "mean_token_accuracy": 0.858113169670105, + "num_tokens": 54665154.0, + "step": 1499 + }, + { + "epoch": 0.2785515320334262, + "grad_norm": 1.6668357849121094, + "learning_rate": 9.2759900990099e-07, + "loss": 0.5104, + "mean_token_accuracy": 0.8307089805603027, + "num_tokens": 54702343.0, + "step": 1500 + }, + { + "epoch": 0.2787372330547818, + "grad_norm": 1.598191499710083, + "learning_rate": 9.282178217821782e-07, + "loss": 0.3922, + "mean_token_accuracy": 0.8647867441177368, + "num_tokens": 54736604.0, + "step": 1501 + }, + { + "epoch": 0.27892293407613744, + "grad_norm": 1.6790306568145752, + "learning_rate": 9.288366336633663e-07, + "loss": 0.5138, + "mean_token_accuracy": 0.8273780345916748, + "num_tokens": 54774802.0, + "step": 1502 + }, + { + "epoch": 0.27910863509749306, + "grad_norm": 1.698122262954712, + "learning_rate": 9.294554455445544e-07, + "loss": 0.4953, + "mean_token_accuracy": 0.8361957669258118, + "num_tokens": 54806976.0, + "step": 1503 + }, + { + "epoch": 0.27929433611884863, + "grad_norm": 1.555909276008606, + "learning_rate": 9.300742574257425e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.8458950519561768, + "num_tokens": 54845398.0, + "step": 1504 + }, + { + "epoch": 0.27948003714020425, + "grad_norm": 1.5614256858825684, + "learning_rate": 9.306930693069307e-07, + "loss": 0.3657, + "mean_token_accuracy": 0.8763972520828247, + "num_tokens": 54880867.0, + "step": 1505 + }, + { + "epoch": 0.2796657381615599, + "grad_norm": 1.6570076942443848, + "learning_rate": 9.313118811881188e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8575561046600342, + "num_tokens": 54913524.0, + "step": 1506 + }, + { + "epoch": 0.2798514391829155, + "grad_norm": 1.8240456581115723, + "learning_rate": 9.31930693069307e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8426343202590942, + "num_tokens": 54944879.0, + "step": 1507 + }, + { + "epoch": 0.2800371402042711, + "grad_norm": 1.53097403049469, + "learning_rate": 9.32549504950495e-07, + "loss": 0.4614, + "mean_token_accuracy": 0.8475320339202881, + "num_tokens": 54985361.0, + "step": 1508 + }, + { + "epoch": 0.28022284122562674, + "grad_norm": 1.6654092073440552, + "learning_rate": 9.331683168316832e-07, + "loss": 0.4328, + "mean_token_accuracy": 0.8544930815696716, + "num_tokens": 55023492.0, + "step": 1509 + }, + { + "epoch": 0.28040854224698236, + "grad_norm": 1.46049964427948, + "learning_rate": 9.337871287128712e-07, + "loss": 0.409, + "mean_token_accuracy": 0.8639415502548218, + "num_tokens": 55061037.0, + "step": 1510 + }, + { + "epoch": 0.280594243268338, + "grad_norm": 1.6185733079910278, + "learning_rate": 9.344059405940594e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.8522621989250183, + "num_tokens": 55096956.0, + "step": 1511 + }, + { + "epoch": 0.2807799442896936, + "grad_norm": 1.6439701318740845, + "learning_rate": 9.350247524752475e-07, + "loss": 0.4223, + "mean_token_accuracy": 0.8570594787597656, + "num_tokens": 55129597.0, + "step": 1512 + }, + { + "epoch": 0.28096564531104923, + "grad_norm": 1.6056468486785889, + "learning_rate": 9.356435643564356e-07, + "loss": 0.3778, + "mean_token_accuracy": 0.8732120990753174, + "num_tokens": 55163148.0, + "step": 1513 + }, + { + "epoch": 0.28115134633240485, + "grad_norm": 1.5463666915893555, + "learning_rate": 9.362623762376237e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.8546167016029358, + "num_tokens": 55200160.0, + "step": 1514 + }, + { + "epoch": 0.28133704735376047, + "grad_norm": 1.6822599172592163, + "learning_rate": 9.368811881188119e-07, + "loss": 0.4307, + "mean_token_accuracy": 0.85590660572052, + "num_tokens": 55235323.0, + "step": 1515 + }, + { + "epoch": 0.28152274837511604, + "grad_norm": 1.6191574335098267, + "learning_rate": 9.374999999999999e-07, + "loss": 0.5197, + "mean_token_accuracy": 0.833986759185791, + "num_tokens": 55275913.0, + "step": 1516 + }, + { + "epoch": 0.28170844939647166, + "grad_norm": 1.6868116855621338, + "learning_rate": 9.38118811881188e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.8433272838592529, + "num_tokens": 55310641.0, + "step": 1517 + }, + { + "epoch": 0.2818941504178273, + "grad_norm": 1.6660962104797363, + "learning_rate": 9.387376237623762e-07, + "loss": 0.4598, + "mean_token_accuracy": 0.8495337963104248, + "num_tokens": 55349267.0, + "step": 1518 + }, + { + "epoch": 0.2820798514391829, + "grad_norm": 1.6183419227600098, + "learning_rate": 9.393564356435643e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8528720140457153, + "num_tokens": 55384297.0, + "step": 1519 + }, + { + "epoch": 0.2822655524605385, + "grad_norm": 1.3991355895996094, + "learning_rate": 9.399752475247525e-07, + "loss": 0.4521, + "mean_token_accuracy": 0.8459113836288452, + "num_tokens": 55430852.0, + "step": 1520 + }, + { + "epoch": 0.28245125348189415, + "grad_norm": 1.6154143810272217, + "learning_rate": 9.405940594059405e-07, + "loss": 0.3986, + "mean_token_accuracy": 0.8662365674972534, + "num_tokens": 55465778.0, + "step": 1521 + }, + { + "epoch": 0.28263695450324977, + "grad_norm": 1.635103464126587, + "learning_rate": 9.412128712871287e-07, + "loss": 0.439, + "mean_token_accuracy": 0.8509831428527832, + "num_tokens": 55499305.0, + "step": 1522 + }, + { + "epoch": 0.2828226555246054, + "grad_norm": 1.5632140636444092, + "learning_rate": 9.418316831683168e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.8502670526504517, + "num_tokens": 55537769.0, + "step": 1523 + }, + { + "epoch": 0.283008356545961, + "grad_norm": 1.4974421262741089, + "learning_rate": 9.424504950495049e-07, + "loss": 0.4353, + "mean_token_accuracy": 0.8545238971710205, + "num_tokens": 55578743.0, + "step": 1524 + }, + { + "epoch": 0.28319405756731664, + "grad_norm": 1.5087122917175293, + "learning_rate": 9.43069306930693e-07, + "loss": 0.4598, + "mean_token_accuracy": 0.8498827219009399, + "num_tokens": 55617579.0, + "step": 1525 + }, + { + "epoch": 0.28337975858867226, + "grad_norm": 1.7088738679885864, + "learning_rate": 9.436881188118811e-07, + "loss": 0.4196, + "mean_token_accuracy": 0.8592530488967896, + "num_tokens": 55650538.0, + "step": 1526 + }, + { + "epoch": 0.2835654596100279, + "grad_norm": 1.6544712781906128, + "learning_rate": 9.443069306930692e-07, + "loss": 0.4049, + "mean_token_accuracy": 0.8677226305007935, + "num_tokens": 55684554.0, + "step": 1527 + }, + { + "epoch": 0.28375116063138345, + "grad_norm": 1.5109279155731201, + "learning_rate": 9.449257425742574e-07, + "loss": 0.4465, + "mean_token_accuracy": 0.8486942052841187, + "num_tokens": 55723391.0, + "step": 1528 + }, + { + "epoch": 0.28393686165273907, + "grad_norm": 1.6294766664505005, + "learning_rate": 9.455445544554454e-07, + "loss": 0.4106, + "mean_token_accuracy": 0.8647583723068237, + "num_tokens": 55760083.0, + "step": 1529 + }, + { + "epoch": 0.2841225626740947, + "grad_norm": 1.6245709657669067, + "learning_rate": 9.461633663366336e-07, + "loss": 0.3816, + "mean_token_accuracy": 0.8686017394065857, + "num_tokens": 55795405.0, + "step": 1530 + }, + { + "epoch": 0.2843082636954503, + "grad_norm": 1.4926528930664062, + "learning_rate": 9.467821782178216e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8589589595794678, + "num_tokens": 55839389.0, + "step": 1531 + }, + { + "epoch": 0.28449396471680594, + "grad_norm": 1.6511917114257812, + "learning_rate": 9.474009900990099e-07, + "loss": 0.3915, + "mean_token_accuracy": 0.8673043847084045, + "num_tokens": 55871620.0, + "step": 1532 + }, + { + "epoch": 0.28467966573816156, + "grad_norm": 1.436730980873108, + "learning_rate": 9.48019801980198e-07, + "loss": 0.4123, + "mean_token_accuracy": 0.8597041368484497, + "num_tokens": 55914613.0, + "step": 1533 + }, + { + "epoch": 0.2848653667595172, + "grad_norm": 1.5943353176116943, + "learning_rate": 9.486386138613861e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8388098478317261, + "num_tokens": 55956095.0, + "step": 1534 + }, + { + "epoch": 0.2850510677808728, + "grad_norm": 1.6088535785675049, + "learning_rate": 9.492574257425742e-07, + "loss": 0.4534, + "mean_token_accuracy": 0.8453937768936157, + "num_tokens": 55992585.0, + "step": 1535 + }, + { + "epoch": 0.2852367688022284, + "grad_norm": 1.5032228231430054, + "learning_rate": 9.498762376237624e-07, + "loss": 0.4269, + "mean_token_accuracy": 0.85645592212677, + "num_tokens": 56031091.0, + "step": 1536 + }, + { + "epoch": 0.28542246982358405, + "grad_norm": 1.5320658683776855, + "learning_rate": 9.504950495049504e-07, + "loss": 0.4083, + "mean_token_accuracy": 0.8618971109390259, + "num_tokens": 56067392.0, + "step": 1537 + }, + { + "epoch": 0.28560817084493967, + "grad_norm": 1.7108992338180542, + "learning_rate": 9.511138613861386e-07, + "loss": 0.4524, + "mean_token_accuracy": 0.8488082885742188, + "num_tokens": 56099897.0, + "step": 1538 + }, + { + "epoch": 0.2857938718662953, + "grad_norm": 1.577620029449463, + "learning_rate": 9.517326732673266e-07, + "loss": 0.4119, + "mean_token_accuracy": 0.857927680015564, + "num_tokens": 56137826.0, + "step": 1539 + }, + { + "epoch": 0.28597957288765086, + "grad_norm": 1.6132196187973022, + "learning_rate": 9.523514851485148e-07, + "loss": 0.3716, + "mean_token_accuracy": 0.8750404119491577, + "num_tokens": 56172040.0, + "step": 1540 + }, + { + "epoch": 0.2861652739090065, + "grad_norm": 1.651103138923645, + "learning_rate": 9.529702970297029e-07, + "loss": 0.4103, + "mean_token_accuracy": 0.8576169610023499, + "num_tokens": 56203584.0, + "step": 1541 + }, + { + "epoch": 0.2863509749303621, + "grad_norm": 1.5842593908309937, + "learning_rate": 9.53589108910891e-07, + "loss": 0.4072, + "mean_token_accuracy": 0.8628799915313721, + "num_tokens": 56237198.0, + "step": 1542 + }, + { + "epoch": 0.2865366759517177, + "grad_norm": 1.5818469524383545, + "learning_rate": 9.542079207920792e-07, + "loss": 0.4503, + "mean_token_accuracy": 0.8476930856704712, + "num_tokens": 56275071.0, + "step": 1543 + }, + { + "epoch": 0.28672237697307335, + "grad_norm": 1.5942894220352173, + "learning_rate": 9.548267326732672e-07, + "loss": 0.4352, + "mean_token_accuracy": 0.8539393544197083, + "num_tokens": 56310633.0, + "step": 1544 + }, + { + "epoch": 0.28690807799442897, + "grad_norm": 1.7603044509887695, + "learning_rate": 9.554455445544553e-07, + "loss": 0.434, + "mean_token_accuracy": 0.8529263734817505, + "num_tokens": 56344593.0, + "step": 1545 + }, + { + "epoch": 0.2870937790157846, + "grad_norm": 1.775220513343811, + "learning_rate": 9.560643564356436e-07, + "loss": 0.4542, + "mean_token_accuracy": 0.8459818959236145, + "num_tokens": 56380371.0, + "step": 1546 + }, + { + "epoch": 0.2872794800371402, + "grad_norm": 1.6011356115341187, + "learning_rate": 9.566831683168316e-07, + "loss": 0.446, + "mean_token_accuracy": 0.8561841249465942, + "num_tokens": 56415267.0, + "step": 1547 + }, + { + "epoch": 0.28746518105849583, + "grad_norm": 1.4324475526809692, + "learning_rate": 9.573019801980197e-07, + "loss": 0.3729, + "mean_token_accuracy": 0.8735769987106323, + "num_tokens": 56457335.0, + "step": 1548 + }, + { + "epoch": 0.28765088207985146, + "grad_norm": 1.6494214534759521, + "learning_rate": 9.579207920792078e-07, + "loss": 0.4441, + "mean_token_accuracy": 0.8547149300575256, + "num_tokens": 56491526.0, + "step": 1549 + }, + { + "epoch": 0.2878365831012071, + "grad_norm": 1.8666718006134033, + "learning_rate": 9.58539603960396e-07, + "loss": 0.4662, + "mean_token_accuracy": 0.8439765572547913, + "num_tokens": 56518272.0, + "step": 1550 + }, + { + "epoch": 0.2880222841225627, + "grad_norm": 1.5260891914367676, + "learning_rate": 9.59158415841584e-07, + "loss": 0.3654, + "mean_token_accuracy": 0.8732427358627319, + "num_tokens": 56551758.0, + "step": 1551 + }, + { + "epoch": 0.28820798514391827, + "grad_norm": 1.522660732269287, + "learning_rate": 9.597772277227723e-07, + "loss": 0.3787, + "mean_token_accuracy": 0.8685113787651062, + "num_tokens": 56587378.0, + "step": 1552 + }, + { + "epoch": 0.2883936861652739, + "grad_norm": 1.580430507659912, + "learning_rate": 9.603960396039604e-07, + "loss": 0.4372, + "mean_token_accuracy": 0.8514842987060547, + "num_tokens": 56624012.0, + "step": 1553 + }, + { + "epoch": 0.2885793871866295, + "grad_norm": 1.5248355865478516, + "learning_rate": 9.610148514851485e-07, + "loss": 0.4208, + "mean_token_accuracy": 0.855042040348053, + "num_tokens": 56659614.0, + "step": 1554 + }, + { + "epoch": 0.28876508820798513, + "grad_norm": 1.625423789024353, + "learning_rate": 9.616336633663365e-07, + "loss": 0.4453, + "mean_token_accuracy": 0.8536367416381836, + "num_tokens": 56695594.0, + "step": 1555 + }, + { + "epoch": 0.28895078922934075, + "grad_norm": 1.6108883619308472, + "learning_rate": 9.622524752475248e-07, + "loss": 0.4189, + "mean_token_accuracy": 0.8584017753601074, + "num_tokens": 56728073.0, + "step": 1556 + }, + { + "epoch": 0.2891364902506964, + "grad_norm": 1.6028655767440796, + "learning_rate": 9.628712871287129e-07, + "loss": 0.4306, + "mean_token_accuracy": 0.8553062081336975, + "num_tokens": 56767272.0, + "step": 1557 + }, + { + "epoch": 0.289322191272052, + "grad_norm": 1.7100987434387207, + "learning_rate": 9.63490099009901e-07, + "loss": 0.4306, + "mean_token_accuracy": 0.8537960648536682, + "num_tokens": 56802508.0, + "step": 1558 + }, + { + "epoch": 0.2895078922934076, + "grad_norm": 2.103381395339966, + "learning_rate": 9.64108910891089e-07, + "loss": 0.4481, + "mean_token_accuracy": 0.852236807346344, + "num_tokens": 56837676.0, + "step": 1559 + }, + { + "epoch": 0.28969359331476324, + "grad_norm": 1.5628799200057983, + "learning_rate": 9.647277227722772e-07, + "loss": 0.4319, + "mean_token_accuracy": 0.8533445596694946, + "num_tokens": 56871407.0, + "step": 1560 + }, + { + "epoch": 0.28987929433611886, + "grad_norm": 1.6233984231948853, + "learning_rate": 9.653465346534653e-07, + "loss": 0.4, + "mean_token_accuracy": 0.8662494421005249, + "num_tokens": 56905994.0, + "step": 1561 + }, + { + "epoch": 0.2900649953574745, + "grad_norm": 1.6414681673049927, + "learning_rate": 9.659653465346534e-07, + "loss": 0.4455, + "mean_token_accuracy": 0.8501232862472534, + "num_tokens": 56939616.0, + "step": 1562 + }, + { + "epoch": 0.2902506963788301, + "grad_norm": 1.4183579683303833, + "learning_rate": 9.665841584158414e-07, + "loss": 0.4377, + "mean_token_accuracy": 0.8520117402076721, + "num_tokens": 56982956.0, + "step": 1563 + }, + { + "epoch": 0.2904363974001857, + "grad_norm": 1.9247864484786987, + "learning_rate": 9.672029702970297e-07, + "loss": 0.4417, + "mean_token_accuracy": 0.8431386351585388, + "num_tokens": 57009070.0, + "step": 1564 + }, + { + "epoch": 0.2906220984215413, + "grad_norm": 1.4939725399017334, + "learning_rate": 9.678217821782177e-07, + "loss": 0.4342, + "mean_token_accuracy": 0.8534002304077148, + "num_tokens": 57047853.0, + "step": 1565 + }, + { + "epoch": 0.2908077994428969, + "grad_norm": 1.613704800605774, + "learning_rate": 9.68440594059406e-07, + "loss": 0.4277, + "mean_token_accuracy": 0.8558313846588135, + "num_tokens": 57084682.0, + "step": 1566 + }, + { + "epoch": 0.29099350046425254, + "grad_norm": 1.6551445722579956, + "learning_rate": 9.69059405940594e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.854617178440094, + "num_tokens": 57118395.0, + "step": 1567 + }, + { + "epoch": 0.29117920148560816, + "grad_norm": 1.5787100791931152, + "learning_rate": 9.696782178217821e-07, + "loss": 0.3983, + "mean_token_accuracy": 0.8664171695709229, + "num_tokens": 57156683.0, + "step": 1568 + }, + { + "epoch": 0.2913649025069638, + "grad_norm": 1.5883411169052124, + "learning_rate": 9.702970297029702e-07, + "loss": 0.4531, + "mean_token_accuracy": 0.8499621748924255, + "num_tokens": 57196873.0, + "step": 1569 + }, + { + "epoch": 0.2915506035283194, + "grad_norm": 1.5164421796798706, + "learning_rate": 9.709158415841585e-07, + "loss": 0.4244, + "mean_token_accuracy": 0.8580493927001953, + "num_tokens": 57236359.0, + "step": 1570 + }, + { + "epoch": 0.29173630454967503, + "grad_norm": 1.508772850036621, + "learning_rate": 9.715346534653465e-07, + "loss": 0.4586, + "mean_token_accuracy": 0.8475860357284546, + "num_tokens": 57275767.0, + "step": 1571 + }, + { + "epoch": 0.29192200557103065, + "grad_norm": 1.570620059967041, + "learning_rate": 9.721534653465346e-07, + "loss": 0.4389, + "mean_token_accuracy": 0.851467490196228, + "num_tokens": 57312465.0, + "step": 1572 + }, + { + "epoch": 0.2921077065923863, + "grad_norm": 1.6344523429870605, + "learning_rate": 9.727722772277226e-07, + "loss": 0.4335, + "mean_token_accuracy": 0.8537894487380981, + "num_tokens": 57347143.0, + "step": 1573 + }, + { + "epoch": 0.2922934076137419, + "grad_norm": 1.846683382987976, + "learning_rate": 9.73391089108911e-07, + "loss": 0.3732, + "mean_token_accuracy": 0.8678882122039795, + "num_tokens": 57376521.0, + "step": 1574 + }, + { + "epoch": 0.2924791086350975, + "grad_norm": 1.61351478099823, + "learning_rate": 9.74009900990099e-07, + "loss": 0.427, + "mean_token_accuracy": 0.852542519569397, + "num_tokens": 57411280.0, + "step": 1575 + }, + { + "epoch": 0.2926648096564531, + "grad_norm": 1.57405424118042, + "learning_rate": 9.74628712871287e-07, + "loss": 0.472, + "mean_token_accuracy": 0.8419618606567383, + "num_tokens": 57450145.0, + "step": 1576 + }, + { + "epoch": 0.2928505106778087, + "grad_norm": 1.5723146200180054, + "learning_rate": 9.75247524752475e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.8468043208122253, + "num_tokens": 57489710.0, + "step": 1577 + }, + { + "epoch": 0.29303621169916433, + "grad_norm": 1.437626600265503, + "learning_rate": 9.758663366336633e-07, + "loss": 0.4115, + "mean_token_accuracy": 0.8597531318664551, + "num_tokens": 57530166.0, + "step": 1578 + }, + { + "epoch": 0.29322191272051995, + "grad_norm": 1.69264554977417, + "learning_rate": 9.764851485148514e-07, + "loss": 0.4396, + "mean_token_accuracy": 0.8537781834602356, + "num_tokens": 57565168.0, + "step": 1579 + }, + { + "epoch": 0.2934076137418756, + "grad_norm": 1.4405778646469116, + "learning_rate": 9.771039603960397e-07, + "loss": 0.3832, + "mean_token_accuracy": 0.8672972917556763, + "num_tokens": 57601738.0, + "step": 1580 + }, + { + "epoch": 0.2935933147632312, + "grad_norm": 1.5922826528549194, + "learning_rate": 9.777227722772277e-07, + "loss": 0.4064, + "mean_token_accuracy": 0.8604366183280945, + "num_tokens": 57637927.0, + "step": 1581 + }, + { + "epoch": 0.2937790157845868, + "grad_norm": 1.496741771697998, + "learning_rate": 9.783415841584158e-07, + "loss": 0.3909, + "mean_token_accuracy": 0.8661201596260071, + "num_tokens": 57674041.0, + "step": 1582 + }, + { + "epoch": 0.29396471680594244, + "grad_norm": 1.4491342306137085, + "learning_rate": 9.789603960396039e-07, + "loss": 0.4371, + "mean_token_accuracy": 0.851347804069519, + "num_tokens": 57712912.0, + "step": 1583 + }, + { + "epoch": 0.29415041782729806, + "grad_norm": 1.5565966367721558, + "learning_rate": 9.795792079207921e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8375394940376282, + "num_tokens": 57751117.0, + "step": 1584 + }, + { + "epoch": 0.2943361188486537, + "grad_norm": 1.6818296909332275, + "learning_rate": 9.801980198019802e-07, + "loss": 0.3756, + "mean_token_accuracy": 0.8718197345733643, + "num_tokens": 57785570.0, + "step": 1585 + }, + { + "epoch": 0.2945218198700093, + "grad_norm": 1.5652729272842407, + "learning_rate": 9.808168316831682e-07, + "loss": 0.4257, + "mean_token_accuracy": 0.8542141914367676, + "num_tokens": 57823782.0, + "step": 1586 + }, + { + "epoch": 0.2947075208913649, + "grad_norm": 1.5864760875701904, + "learning_rate": 9.814356435643563e-07, + "loss": 0.4337, + "mean_token_accuracy": 0.8529244065284729, + "num_tokens": 57861857.0, + "step": 1587 + }, + { + "epoch": 0.2948932219127205, + "grad_norm": 1.6123765707015991, + "learning_rate": 9.820544554455446e-07, + "loss": 0.3999, + "mean_token_accuracy": 0.8653579950332642, + "num_tokens": 57899223.0, + "step": 1588 + }, + { + "epoch": 0.2950789229340761, + "grad_norm": 1.6039544343948364, + "learning_rate": 9.826732673267326e-07, + "loss": 0.4449, + "mean_token_accuracy": 0.8516278266906738, + "num_tokens": 57936597.0, + "step": 1589 + }, + { + "epoch": 0.29526462395543174, + "grad_norm": 1.5751869678497314, + "learning_rate": 9.832920792079207e-07, + "loss": 0.4188, + "mean_token_accuracy": 0.855880081653595, + "num_tokens": 57973395.0, + "step": 1590 + }, + { + "epoch": 0.29545032497678736, + "grad_norm": 1.6449486017227173, + "learning_rate": 9.83910891089109e-07, + "loss": 0.4111, + "mean_token_accuracy": 0.8606522083282471, + "num_tokens": 58005489.0, + "step": 1591 + }, + { + "epoch": 0.295636025998143, + "grad_norm": 1.633993148803711, + "learning_rate": 9.84529702970297e-07, + "loss": 0.4172, + "mean_token_accuracy": 0.8555951118469238, + "num_tokens": 58040710.0, + "step": 1592 + }, + { + "epoch": 0.2958217270194986, + "grad_norm": 1.5209332704544067, + "learning_rate": 9.85148514851485e-07, + "loss": 0.3915, + "mean_token_accuracy": 0.8655099868774414, + "num_tokens": 58077662.0, + "step": 1593 + }, + { + "epoch": 0.2960074280408542, + "grad_norm": 1.5940239429473877, + "learning_rate": 9.857673267326733e-07, + "loss": 0.4218, + "mean_token_accuracy": 0.8593885898590088, + "num_tokens": 58115264.0, + "step": 1594 + }, + { + "epoch": 0.29619312906220985, + "grad_norm": 1.4961340427398682, + "learning_rate": 9.863861386138614e-07, + "loss": 0.4614, + "mean_token_accuracy": 0.8497849702835083, + "num_tokens": 58156689.0, + "step": 1595 + }, + { + "epoch": 0.29637883008356547, + "grad_norm": 1.611639142036438, + "learning_rate": 9.870049504950495e-07, + "loss": 0.3874, + "mean_token_accuracy": 0.8682063221931458, + "num_tokens": 58188038.0, + "step": 1596 + }, + { + "epoch": 0.2965645311049211, + "grad_norm": 1.552751898765564, + "learning_rate": 9.876237623762375e-07, + "loss": 0.4171, + "mean_token_accuracy": 0.858039140701294, + "num_tokens": 58224347.0, + "step": 1597 + }, + { + "epoch": 0.2967502321262767, + "grad_norm": 1.8299579620361328, + "learning_rate": 9.882425742574258e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.8378880620002747, + "num_tokens": 58256156.0, + "step": 1598 + }, + { + "epoch": 0.29693593314763234, + "grad_norm": 1.5571544170379639, + "learning_rate": 9.888613861386138e-07, + "loss": 0.4626, + "mean_token_accuracy": 0.8462426662445068, + "num_tokens": 58293987.0, + "step": 1599 + }, + { + "epoch": 0.2971216341689879, + "grad_norm": 1.4996352195739746, + "learning_rate": 9.89480198019802e-07, + "loss": 0.4185, + "mean_token_accuracy": 0.8583632707595825, + "num_tokens": 58330646.0, + "step": 1600 + }, + { + "epoch": 0.2973073351903435, + "grad_norm": 1.5713175535202026, + "learning_rate": 9.9009900990099e-07, + "loss": 0.4039, + "mean_token_accuracy": 0.862694263458252, + "num_tokens": 58368403.0, + "step": 1601 + }, + { + "epoch": 0.29749303621169915, + "grad_norm": 1.6888011693954468, + "learning_rate": 9.907178217821782e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.847699761390686, + "num_tokens": 58401400.0, + "step": 1602 + }, + { + "epoch": 0.29767873723305477, + "grad_norm": 1.568377137184143, + "learning_rate": 9.913366336633663e-07, + "loss": 0.4554, + "mean_token_accuracy": 0.8468320369720459, + "num_tokens": 58435228.0, + "step": 1603 + }, + { + "epoch": 0.2978644382544104, + "grad_norm": 1.5015242099761963, + "learning_rate": 9.919554455445546e-07, + "loss": 0.424, + "mean_token_accuracy": 0.8575040102005005, + "num_tokens": 58474145.0, + "step": 1604 + }, + { + "epoch": 0.298050139275766, + "grad_norm": 1.6889593601226807, + "learning_rate": 9.925742574257426e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.8276218175888062, + "num_tokens": 58508851.0, + "step": 1605 + }, + { + "epoch": 0.29823584029712163, + "grad_norm": 1.4579923152923584, + "learning_rate": 9.931930693069307e-07, + "loss": 0.4503, + "mean_token_accuracy": 0.8497328162193298, + "num_tokens": 58550721.0, + "step": 1606 + }, + { + "epoch": 0.29842154131847726, + "grad_norm": 1.4063005447387695, + "learning_rate": 9.938118811881187e-07, + "loss": 0.3812, + "mean_token_accuracy": 0.8699814081192017, + "num_tokens": 58590517.0, + "step": 1607 + }, + { + "epoch": 0.2986072423398329, + "grad_norm": 1.5734186172485352, + "learning_rate": 9.94430693069307e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.8423557877540588, + "num_tokens": 58628809.0, + "step": 1608 + }, + { + "epoch": 0.2987929433611885, + "grad_norm": 1.6694444417953491, + "learning_rate": 9.95049504950495e-07, + "loss": 0.4302, + "mean_token_accuracy": 0.857363224029541, + "num_tokens": 58664068.0, + "step": 1609 + }, + { + "epoch": 0.2989786443825441, + "grad_norm": 1.6173219680786133, + "learning_rate": 9.956683168316831e-07, + "loss": 0.4209, + "mean_token_accuracy": 0.8576443195343018, + "num_tokens": 58698281.0, + "step": 1610 + }, + { + "epoch": 0.29916434540389975, + "grad_norm": 1.4991798400878906, + "learning_rate": 9.962871287128712e-07, + "loss": 0.4028, + "mean_token_accuracy": 0.8645110130310059, + "num_tokens": 58736618.0, + "step": 1611 + }, + { + "epoch": 0.2993500464252553, + "grad_norm": 1.5195391178131104, + "learning_rate": 9.969059405940595e-07, + "loss": 0.4683, + "mean_token_accuracy": 0.8443461656570435, + "num_tokens": 58779992.0, + "step": 1612 + }, + { + "epoch": 0.29953574744661093, + "grad_norm": 1.6957969665527344, + "learning_rate": 9.975247524752475e-07, + "loss": 0.446, + "mean_token_accuracy": 0.8472301959991455, + "num_tokens": 58811731.0, + "step": 1613 + }, + { + "epoch": 0.29972144846796656, + "grad_norm": 1.5109670162200928, + "learning_rate": 9.981435643564356e-07, + "loss": 0.424, + "mean_token_accuracy": 0.8590554594993591, + "num_tokens": 58850424.0, + "step": 1614 + }, + { + "epoch": 0.2999071494893222, + "grad_norm": 1.656172752380371, + "learning_rate": 9.987623762376236e-07, + "loss": 0.4271, + "mean_token_accuracy": 0.8556394577026367, + "num_tokens": 58884929.0, + "step": 1615 + }, + { + "epoch": 0.3000928505106778, + "grad_norm": 1.60227632522583, + "learning_rate": 9.99381188118812e-07, + "loss": 0.4071, + "mean_token_accuracy": 0.8629167675971985, + "num_tokens": 58916755.0, + "step": 1616 + }, + { + "epoch": 0.3002785515320334, + "grad_norm": 1.5027446746826172, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8704879283905029, + "num_tokens": 58953101.0, + "step": 1617 + }, + { + "epoch": 0.30046425255338904, + "grad_norm": 1.7971463203430176, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8351997137069702, + "num_tokens": 58987335.0, + "step": 1618 + }, + { + "epoch": 0.30064995357474467, + "grad_norm": 1.5596165657043457, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8424156308174133, + "num_tokens": 59026939.0, + "step": 1619 + }, + { + "epoch": 0.3008356545961003, + "grad_norm": 1.6679121255874634, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8653750419616699, + "num_tokens": 59056832.0, + "step": 1620 + }, + { + "epoch": 0.3010213556174559, + "grad_norm": 1.6956647634506226, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8625099658966064, + "num_tokens": 59093479.0, + "step": 1621 + }, + { + "epoch": 0.30120705663881153, + "grad_norm": 1.6003053188323975, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8561068177223206, + "num_tokens": 59132676.0, + "step": 1622 + }, + { + "epoch": 0.30139275766016715, + "grad_norm": 1.5703908205032349, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8624037504196167, + "num_tokens": 59169082.0, + "step": 1623 + }, + { + "epoch": 0.3015784586815227, + "grad_norm": 1.4547077417373657, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8705307245254517, + "num_tokens": 59206612.0, + "step": 1624 + }, + { + "epoch": 0.30176415970287834, + "grad_norm": 1.7470144033432007, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8473606109619141, + "num_tokens": 59238929.0, + "step": 1625 + }, + { + "epoch": 0.30194986072423396, + "grad_norm": 1.4754481315612793, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8697811961174011, + "num_tokens": 59274833.0, + "step": 1626 + }, + { + "epoch": 0.3021355617455896, + "grad_norm": 1.5720665454864502, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.850556492805481, + "num_tokens": 59312745.0, + "step": 1627 + }, + { + "epoch": 0.3023212627669452, + "grad_norm": 1.49203360080719, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.856151819229126, + "num_tokens": 59352769.0, + "step": 1628 + }, + { + "epoch": 0.30250696378830083, + "grad_norm": 1.5840622186660767, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8524740934371948, + "num_tokens": 59391320.0, + "step": 1629 + }, + { + "epoch": 0.30269266480965645, + "grad_norm": 1.857159972190857, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8554304838180542, + "num_tokens": 59419139.0, + "step": 1630 + }, + { + "epoch": 0.3028783658310121, + "grad_norm": 1.6314131021499634, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8552300930023193, + "num_tokens": 59452289.0, + "step": 1631 + }, + { + "epoch": 0.3030640668523677, + "grad_norm": 1.6151379346847534, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8605693578720093, + "num_tokens": 59485756.0, + "step": 1632 + }, + { + "epoch": 0.3032497678737233, + "grad_norm": 1.5397894382476807, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8356566429138184, + "num_tokens": 59528824.0, + "step": 1633 + }, + { + "epoch": 0.30343546889507894, + "grad_norm": 1.7434581518173218, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.851819634437561, + "num_tokens": 59563723.0, + "step": 1634 + }, + { + "epoch": 0.30362116991643456, + "grad_norm": 1.5998631715774536, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8557727932929993, + "num_tokens": 59600263.0, + "step": 1635 + }, + { + "epoch": 0.3038068709377902, + "grad_norm": 1.509209156036377, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8629458546638489, + "num_tokens": 59635183.0, + "step": 1636 + }, + { + "epoch": 0.30399257195914575, + "grad_norm": 1.5112735033035278, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8617103695869446, + "num_tokens": 59674809.0, + "step": 1637 + }, + { + "epoch": 0.3041782729805014, + "grad_norm": 1.4684494733810425, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8605333566665649, + "num_tokens": 59717443.0, + "step": 1638 + }, + { + "epoch": 0.304363974001857, + "grad_norm": 1.5695955753326416, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8779253959655762, + "num_tokens": 59752026.0, + "step": 1639 + }, + { + "epoch": 0.3045496750232126, + "grad_norm": 1.4997321367263794, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8663585782051086, + "num_tokens": 59788433.0, + "step": 1640 + }, + { + "epoch": 0.30473537604456824, + "grad_norm": 1.523370623588562, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8640961647033691, + "num_tokens": 59821525.0, + "step": 1641 + }, + { + "epoch": 0.30492107706592386, + "grad_norm": 1.500758409500122, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8633348941802979, + "num_tokens": 59857793.0, + "step": 1642 + }, + { + "epoch": 0.3051067780872795, + "grad_norm": 1.4402542114257812, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8714995384216309, + "num_tokens": 59896681.0, + "step": 1643 + }, + { + "epoch": 0.3052924791086351, + "grad_norm": 1.5787913799285889, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8562843799591064, + "num_tokens": 59929779.0, + "step": 1644 + }, + { + "epoch": 0.30547818012999073, + "grad_norm": 1.438686490058899, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8577463030815125, + "num_tokens": 59968469.0, + "step": 1645 + }, + { + "epoch": 0.30566388115134635, + "grad_norm": 1.6084351539611816, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8542887568473816, + "num_tokens": 60004370.0, + "step": 1646 + }, + { + "epoch": 0.305849582172702, + "grad_norm": 1.6734284162521362, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8540260791778564, + "num_tokens": 60038198.0, + "step": 1647 + }, + { + "epoch": 0.3060352831940576, + "grad_norm": 1.6524430513381958, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8575627207756042, + "num_tokens": 60072563.0, + "step": 1648 + }, + { + "epoch": 0.30622098421541316, + "grad_norm": 1.6696423292160034, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8608996868133545, + "num_tokens": 60104756.0, + "step": 1649 + }, + { + "epoch": 0.3064066852367688, + "grad_norm": 1.576911211013794, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8579938411712646, + "num_tokens": 60138564.0, + "step": 1650 + }, + { + "epoch": 0.3065923862581244, + "grad_norm": 1.7706350088119507, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8545410633087158, + "num_tokens": 60169962.0, + "step": 1651 + }, + { + "epoch": 0.30677808727948, + "grad_norm": 1.5946515798568726, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8326818346977234, + "num_tokens": 60209305.0, + "step": 1652 + }, + { + "epoch": 0.30696378830083565, + "grad_norm": 1.8284215927124023, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8541842699050903, + "num_tokens": 60240564.0, + "step": 1653 + }, + { + "epoch": 0.30714948932219127, + "grad_norm": 1.645552396774292, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8591123819351196, + "num_tokens": 60276220.0, + "step": 1654 + }, + { + "epoch": 0.3073351903435469, + "grad_norm": 1.5763416290283203, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8588563799858093, + "num_tokens": 60310679.0, + "step": 1655 + }, + { + "epoch": 0.3075208913649025, + "grad_norm": 1.7848063707351685, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8467352986335754, + "num_tokens": 60343613.0, + "step": 1656 + }, + { + "epoch": 0.30770659238625814, + "grad_norm": 1.5761409997940063, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8551383018493652, + "num_tokens": 60377920.0, + "step": 1657 + }, + { + "epoch": 0.30789229340761376, + "grad_norm": 1.5480716228485107, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8609752058982849, + "num_tokens": 60423485.0, + "step": 1658 + }, + { + "epoch": 0.3080779944289694, + "grad_norm": 1.4298765659332275, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8611153364181519, + "num_tokens": 60466362.0, + "step": 1659 + }, + { + "epoch": 0.308263695450325, + "grad_norm": 1.5830880403518677, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8441252112388611, + "num_tokens": 60507179.0, + "step": 1660 + }, + { + "epoch": 0.30844939647168057, + "grad_norm": 1.584499716758728, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.854942798614502, + "num_tokens": 60542721.0, + "step": 1661 + }, + { + "epoch": 0.3086350974930362, + "grad_norm": 1.8296427726745605, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8496093153953552, + "num_tokens": 60572262.0, + "step": 1662 + }, + { + "epoch": 0.3088207985143918, + "grad_norm": 1.7234257459640503, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8645137548446655, + "num_tokens": 60602991.0, + "step": 1663 + }, + { + "epoch": 0.30900649953574744, + "grad_norm": 1.7166556119918823, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8614301681518555, + "num_tokens": 60635349.0, + "step": 1664 + }, + { + "epoch": 0.30919220055710306, + "grad_norm": 1.656976342201233, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8691465854644775, + "num_tokens": 60672597.0, + "step": 1665 + }, + { + "epoch": 0.3093779015784587, + "grad_norm": 1.572912573814392, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8582122921943665, + "num_tokens": 60710104.0, + "step": 1666 + }, + { + "epoch": 0.3095636025998143, + "grad_norm": 1.6967743635177612, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8559368848800659, + "num_tokens": 60745618.0, + "step": 1667 + }, + { + "epoch": 0.3097493036211699, + "grad_norm": 1.6818701028823853, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8559609055519104, + "num_tokens": 60779228.0, + "step": 1668 + }, + { + "epoch": 0.30993500464252555, + "grad_norm": 1.5983104705810547, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8524738550186157, + "num_tokens": 60818082.0, + "step": 1669 + }, + { + "epoch": 0.31012070566388117, + "grad_norm": 1.5017309188842773, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8696621656417847, + "num_tokens": 60855466.0, + "step": 1670 + }, + { + "epoch": 0.3103064066852368, + "grad_norm": 1.5375514030456543, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8419565558433533, + "num_tokens": 60892724.0, + "step": 1671 + }, + { + "epoch": 0.3104921077065924, + "grad_norm": 1.6281355619430542, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8596933484077454, + "num_tokens": 60926743.0, + "step": 1672 + }, + { + "epoch": 0.310677808727948, + "grad_norm": 3.208130121231079, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.840658962726593, + "num_tokens": 60962079.0, + "step": 1673 + }, + { + "epoch": 0.3108635097493036, + "grad_norm": 1.5592193603515625, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8618607521057129, + "num_tokens": 60998672.0, + "step": 1674 + }, + { + "epoch": 0.3110492107706592, + "grad_norm": 1.5122095346450806, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8487813472747803, + "num_tokens": 61037709.0, + "step": 1675 + }, + { + "epoch": 0.31123491179201485, + "grad_norm": 1.4670910835266113, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8472336530685425, + "num_tokens": 61079183.0, + "step": 1676 + }, + { + "epoch": 0.31142061281337047, + "grad_norm": 1.4645010232925415, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8781888484954834, + "num_tokens": 61112253.0, + "step": 1677 + }, + { + "epoch": 0.3116063138347261, + "grad_norm": 1.630496621131897, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8449193239212036, + "num_tokens": 61146002.0, + "step": 1678 + }, + { + "epoch": 0.3117920148560817, + "grad_norm": 1.5715281963348389, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8557097911834717, + "num_tokens": 61181638.0, + "step": 1679 + }, + { + "epoch": 0.31197771587743733, + "grad_norm": 1.5236363410949707, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.861622154712677, + "num_tokens": 61220212.0, + "step": 1680 + }, + { + "epoch": 0.31216341689879296, + "grad_norm": 1.5966012477874756, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8531595468521118, + "num_tokens": 61255240.0, + "step": 1681 + }, + { + "epoch": 0.3123491179201486, + "grad_norm": 1.4780977964401245, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8697299957275391, + "num_tokens": 61289392.0, + "step": 1682 + }, + { + "epoch": 0.3125348189415042, + "grad_norm": 1.4945006370544434, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8735324740409851, + "num_tokens": 61326956.0, + "step": 1683 + }, + { + "epoch": 0.3127205199628598, + "grad_norm": 1.3739045858383179, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8572235107421875, + "num_tokens": 61375298.0, + "step": 1684 + }, + { + "epoch": 0.3129062209842154, + "grad_norm": 1.5198529958724976, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8505557775497437, + "num_tokens": 61413954.0, + "step": 1685 + }, + { + "epoch": 0.313091922005571, + "grad_norm": 1.7752468585968018, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8395665884017944, + "num_tokens": 61446715.0, + "step": 1686 + }, + { + "epoch": 0.31327762302692663, + "grad_norm": 1.4840530157089233, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.872281014919281, + "num_tokens": 61484960.0, + "step": 1687 + }, + { + "epoch": 0.31346332404828225, + "grad_norm": 1.4563663005828857, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8546372056007385, + "num_tokens": 61525603.0, + "step": 1688 + }, + { + "epoch": 0.3136490250696379, + "grad_norm": 1.62294340133667, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8539186120033264, + "num_tokens": 61561639.0, + "step": 1689 + }, + { + "epoch": 0.3138347260909935, + "grad_norm": 1.7238258123397827, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.861466109752655, + "num_tokens": 61592658.0, + "step": 1690 + }, + { + "epoch": 0.3140204271123491, + "grad_norm": 1.6080697774887085, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8520944714546204, + "num_tokens": 61624829.0, + "step": 1691 + }, + { + "epoch": 0.31420612813370474, + "grad_norm": 1.6606168746948242, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8407249450683594, + "num_tokens": 61659001.0, + "step": 1692 + }, + { + "epoch": 0.31439182915506036, + "grad_norm": 1.530333399772644, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8410725593566895, + "num_tokens": 61697254.0, + "step": 1693 + }, + { + "epoch": 0.314577530176416, + "grad_norm": 1.5253249406814575, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8595876097679138, + "num_tokens": 61735642.0, + "step": 1694 + }, + { + "epoch": 0.3147632311977716, + "grad_norm": 1.6003135442733765, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8406040668487549, + "num_tokens": 61773089.0, + "step": 1695 + }, + { + "epoch": 0.31494893221912723, + "grad_norm": 1.4494080543518066, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8661258220672607, + "num_tokens": 61814910.0, + "step": 1696 + }, + { + "epoch": 0.3151346332404828, + "grad_norm": 1.569470763206482, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8670262098312378, + "num_tokens": 61847527.0, + "step": 1697 + }, + { + "epoch": 0.3153203342618384, + "grad_norm": 1.4784523248672485, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8684512376785278, + "num_tokens": 61886154.0, + "step": 1698 + }, + { + "epoch": 0.31550603528319404, + "grad_norm": 1.6986123323440552, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8403972387313843, + "num_tokens": 61920326.0, + "step": 1699 + }, + { + "epoch": 0.31569173630454966, + "grad_norm": 1.6500436067581177, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8573330044746399, + "num_tokens": 61959738.0, + "step": 1700 + }, + { + "epoch": 0.3158774373259053, + "grad_norm": 1.5827289819717407, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8810817003250122, + "num_tokens": 61995582.0, + "step": 1701 + }, + { + "epoch": 0.3160631383472609, + "grad_norm": 1.5860580205917358, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8640623688697815, + "num_tokens": 62030097.0, + "step": 1702 + }, + { + "epoch": 0.31624883936861653, + "grad_norm": 1.5745707750320435, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8535173535346985, + "num_tokens": 62065804.0, + "step": 1703 + }, + { + "epoch": 0.31643454038997215, + "grad_norm": 1.6548686027526855, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8603026866912842, + "num_tokens": 62096958.0, + "step": 1704 + }, + { + "epoch": 0.3166202414113278, + "grad_norm": 1.6775857210159302, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8401438593864441, + "num_tokens": 62132682.0, + "step": 1705 + }, + { + "epoch": 0.3168059424326834, + "grad_norm": 1.5742700099945068, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8503126502037048, + "num_tokens": 62172734.0, + "step": 1706 + }, + { + "epoch": 0.316991643454039, + "grad_norm": 1.7202173471450806, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8619428873062134, + "num_tokens": 62204646.0, + "step": 1707 + }, + { + "epoch": 0.31717734447539464, + "grad_norm": 1.3902060985565186, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8566498160362244, + "num_tokens": 62246755.0, + "step": 1708 + }, + { + "epoch": 0.3173630454967502, + "grad_norm": 1.577467918395996, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8722141981124878, + "num_tokens": 62279418.0, + "step": 1709 + }, + { + "epoch": 0.31754874651810583, + "grad_norm": 1.6391047239303589, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8602946996688843, + "num_tokens": 62314381.0, + "step": 1710 + }, + { + "epoch": 0.31773444753946145, + "grad_norm": 1.6815484762191772, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8456310629844666, + "num_tokens": 62348229.0, + "step": 1711 + }, + { + "epoch": 0.3179201485608171, + "grad_norm": 1.5905416011810303, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8357479572296143, + "num_tokens": 62388924.0, + "step": 1712 + }, + { + "epoch": 0.3181058495821727, + "grad_norm": 1.6201226711273193, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8749517202377319, + "num_tokens": 62422590.0, + "step": 1713 + }, + { + "epoch": 0.3182915506035283, + "grad_norm": 1.7242496013641357, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8441725373268127, + "num_tokens": 62457103.0, + "step": 1714 + }, + { + "epoch": 0.31847725162488394, + "grad_norm": 1.7863292694091797, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8588549494743347, + "num_tokens": 62487919.0, + "step": 1715 + }, + { + "epoch": 0.31866295264623956, + "grad_norm": 1.754996418952942, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8520016074180603, + "num_tokens": 62517244.0, + "step": 1716 + }, + { + "epoch": 0.3188486536675952, + "grad_norm": 1.4953690767288208, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8472373485565186, + "num_tokens": 62558363.0, + "step": 1717 + }, + { + "epoch": 0.3190343546889508, + "grad_norm": 1.5547045469284058, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8609148859977722, + "num_tokens": 62597389.0, + "step": 1718 + }, + { + "epoch": 0.3192200557103064, + "grad_norm": 1.635607361793518, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8693351745605469, + "num_tokens": 62631714.0, + "step": 1719 + }, + { + "epoch": 0.31940575673166205, + "grad_norm": 1.5013080835342407, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8477738499641418, + "num_tokens": 62672647.0, + "step": 1720 + }, + { + "epoch": 0.3195914577530176, + "grad_norm": 1.6653140783309937, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8459734916687012, + "num_tokens": 62706672.0, + "step": 1721 + }, + { + "epoch": 0.31977715877437324, + "grad_norm": 1.5444062948226929, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8570407629013062, + "num_tokens": 62746322.0, + "step": 1722 + }, + { + "epoch": 0.31996285979572886, + "grad_norm": 1.539432406425476, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8643577098846436, + "num_tokens": 62783219.0, + "step": 1723 + }, + { + "epoch": 0.3201485608170845, + "grad_norm": 1.6387661695480347, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8614360690116882, + "num_tokens": 62820182.0, + "step": 1724 + }, + { + "epoch": 0.3203342618384401, + "grad_norm": 1.5431933403015137, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8533220887184143, + "num_tokens": 62857437.0, + "step": 1725 + }, + { + "epoch": 0.3205199628597957, + "grad_norm": 1.4753440618515015, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8544170260429382, + "num_tokens": 62896312.0, + "step": 1726 + }, + { + "epoch": 0.32070566388115135, + "grad_norm": 1.5357756614685059, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8464619517326355, + "num_tokens": 62936164.0, + "step": 1727 + }, + { + "epoch": 0.32089136490250697, + "grad_norm": 1.500470757484436, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8783887624740601, + "num_tokens": 62969470.0, + "step": 1728 + }, + { + "epoch": 0.3210770659238626, + "grad_norm": 1.5980218648910522, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8555120825767517, + "num_tokens": 63006034.0, + "step": 1729 + }, + { + "epoch": 0.3212627669452182, + "grad_norm": 1.5126006603240967, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8628770112991333, + "num_tokens": 63044639.0, + "step": 1730 + }, + { + "epoch": 0.32144846796657384, + "grad_norm": 1.7461861371994019, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8449116945266724, + "num_tokens": 63076911.0, + "step": 1731 + }, + { + "epoch": 0.32163416898792946, + "grad_norm": 1.4450874328613281, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8619349002838135, + "num_tokens": 63116365.0, + "step": 1732 + }, + { + "epoch": 0.321819870009285, + "grad_norm": 1.5267870426177979, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8603766560554504, + "num_tokens": 63152884.0, + "step": 1733 + }, + { + "epoch": 0.32200557103064065, + "grad_norm": 1.519516944885254, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8635272979736328, + "num_tokens": 63189992.0, + "step": 1734 + }, + { + "epoch": 0.32219127205199627, + "grad_norm": 1.5357502698898315, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8623064756393433, + "num_tokens": 63226683.0, + "step": 1735 + }, + { + "epoch": 0.3223769730733519, + "grad_norm": 1.4963934421539307, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8591431975364685, + "num_tokens": 63264313.0, + "step": 1736 + }, + { + "epoch": 0.3225626740947075, + "grad_norm": 1.5776009559631348, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8582673668861389, + "num_tokens": 63299692.0, + "step": 1737 + }, + { + "epoch": 0.32274837511606314, + "grad_norm": 1.5772850513458252, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8632879257202148, + "num_tokens": 63334403.0, + "step": 1738 + }, + { + "epoch": 0.32293407613741876, + "grad_norm": 1.559951901435852, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8525442481040955, + "num_tokens": 63369761.0, + "step": 1739 + }, + { + "epoch": 0.3231197771587744, + "grad_norm": 1.7637574672698975, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8416932821273804, + "num_tokens": 63404439.0, + "step": 1740 + }, + { + "epoch": 0.32330547818013, + "grad_norm": 1.6351364850997925, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8567498326301575, + "num_tokens": 63443234.0, + "step": 1741 + }, + { + "epoch": 0.3234911792014856, + "grad_norm": 1.799759030342102, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8707743883132935, + "num_tokens": 63471982.0, + "step": 1742 + }, + { + "epoch": 0.32367688022284125, + "grad_norm": 1.4823683500289917, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8547005653381348, + "num_tokens": 63511928.0, + "step": 1743 + }, + { + "epoch": 0.32386258124419687, + "grad_norm": 1.6467514038085938, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.848786473274231, + "num_tokens": 63546230.0, + "step": 1744 + }, + { + "epoch": 0.32404828226555243, + "grad_norm": 1.6186093091964722, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8639835119247437, + "num_tokens": 63583758.0, + "step": 1745 + }, + { + "epoch": 0.32423398328690806, + "grad_norm": 1.4849638938903809, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8654192686080933, + "num_tokens": 63627021.0, + "step": 1746 + }, + { + "epoch": 0.3244196843082637, + "grad_norm": 1.3659350872039795, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8605669736862183, + "num_tokens": 63674279.0, + "step": 1747 + }, + { + "epoch": 0.3246053853296193, + "grad_norm": 1.6401726007461548, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8561826348304749, + "num_tokens": 63709028.0, + "step": 1748 + }, + { + "epoch": 0.3247910863509749, + "grad_norm": 1.560611367225647, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8453464508056641, + "num_tokens": 63745090.0, + "step": 1749 + }, + { + "epoch": 0.32497678737233054, + "grad_norm": 1.550553321838379, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8532793521881104, + "num_tokens": 63781362.0, + "step": 1750 + }, + { + "epoch": 0.32516248839368617, + "grad_norm": 1.512173056602478, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8559633493423462, + "num_tokens": 63819432.0, + "step": 1751 + }, + { + "epoch": 0.3253481894150418, + "grad_norm": 1.4881436824798584, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8593388199806213, + "num_tokens": 63859109.0, + "step": 1752 + }, + { + "epoch": 0.3255338904363974, + "grad_norm": 1.5183324813842773, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8660311698913574, + "num_tokens": 63898829.0, + "step": 1753 + }, + { + "epoch": 0.32571959145775303, + "grad_norm": 1.5109679698944092, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8673195242881775, + "num_tokens": 63935937.0, + "step": 1754 + }, + { + "epoch": 0.32590529247910865, + "grad_norm": 1.6258721351623535, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8683279752731323, + "num_tokens": 63967266.0, + "step": 1755 + }, + { + "epoch": 0.3260909935004643, + "grad_norm": 1.5918664932250977, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8669875860214233, + "num_tokens": 64004822.0, + "step": 1756 + }, + { + "epoch": 0.32627669452181984, + "grad_norm": 1.5602352619171143, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8569600582122803, + "num_tokens": 64042946.0, + "step": 1757 + }, + { + "epoch": 0.32646239554317547, + "grad_norm": 1.596591591835022, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.850631833076477, + "num_tokens": 64082972.0, + "step": 1758 + }, + { + "epoch": 0.3266480965645311, + "grad_norm": 1.6815346479415894, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8641766309738159, + "num_tokens": 64115331.0, + "step": 1759 + }, + { + "epoch": 0.3268337975858867, + "grad_norm": 1.5194602012634277, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.851195216178894, + "num_tokens": 64153392.0, + "step": 1760 + }, + { + "epoch": 0.32701949860724233, + "grad_norm": 1.5261257886886597, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8671026229858398, + "num_tokens": 64187610.0, + "step": 1761 + }, + { + "epoch": 0.32720519962859795, + "grad_norm": 1.5943971872329712, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8539437055587769, + "num_tokens": 64225412.0, + "step": 1762 + }, + { + "epoch": 0.3273909006499536, + "grad_norm": 1.6323107481002808, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8375604748725891, + "num_tokens": 64264898.0, + "step": 1763 + }, + { + "epoch": 0.3275766016713092, + "grad_norm": 1.4840087890625, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.853939950466156, + "num_tokens": 64304794.0, + "step": 1764 + }, + { + "epoch": 0.3277623026926648, + "grad_norm": 1.5823695659637451, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8500434160232544, + "num_tokens": 64343833.0, + "step": 1765 + }, + { + "epoch": 0.32794800371402044, + "grad_norm": 1.660359263420105, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8427621126174927, + "num_tokens": 64380885.0, + "step": 1766 + }, + { + "epoch": 0.32813370473537606, + "grad_norm": 1.5472605228424072, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8530656099319458, + "num_tokens": 64421180.0, + "step": 1767 + }, + { + "epoch": 0.3283194057567317, + "grad_norm": 1.5431580543518066, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8548673987388611, + "num_tokens": 64458750.0, + "step": 1768 + }, + { + "epoch": 0.32850510677808725, + "grad_norm": 1.544070839881897, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8620205521583557, + "num_tokens": 64495621.0, + "step": 1769 + }, + { + "epoch": 0.3286908077994429, + "grad_norm": 1.5585983991622925, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8685280084609985, + "num_tokens": 64537230.0, + "step": 1770 + }, + { + "epoch": 0.3288765088207985, + "grad_norm": 1.7365591526031494, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8659716248512268, + "num_tokens": 64569721.0, + "step": 1771 + }, + { + "epoch": 0.3290622098421541, + "grad_norm": 1.5985243320465088, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8488495349884033, + "num_tokens": 64607908.0, + "step": 1772 + }, + { + "epoch": 0.32924791086350974, + "grad_norm": 1.759108304977417, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8693224191665649, + "num_tokens": 64636897.0, + "step": 1773 + }, + { + "epoch": 0.32943361188486536, + "grad_norm": 1.6800856590270996, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8487614393234253, + "num_tokens": 64670789.0, + "step": 1774 + }, + { + "epoch": 0.329619312906221, + "grad_norm": 1.5791152715682983, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8675950765609741, + "num_tokens": 64708638.0, + "step": 1775 + }, + { + "epoch": 0.3298050139275766, + "grad_norm": 1.5691399574279785, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8618541955947876, + "num_tokens": 64747094.0, + "step": 1776 + }, + { + "epoch": 0.32999071494893223, + "grad_norm": 1.5018000602722168, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8617342114448547, + "num_tokens": 64782885.0, + "step": 1777 + }, + { + "epoch": 0.33017641597028785, + "grad_norm": 1.6614762544631958, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8591126203536987, + "num_tokens": 64818291.0, + "step": 1778 + }, + { + "epoch": 0.3303621169916435, + "grad_norm": 1.6685895919799805, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8428179025650024, + "num_tokens": 64856755.0, + "step": 1779 + }, + { + "epoch": 0.3305478180129991, + "grad_norm": 1.457953691482544, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.867910623550415, + "num_tokens": 64893557.0, + "step": 1780 + }, + { + "epoch": 0.33073351903435466, + "grad_norm": 1.570074439048767, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8690792322158813, + "num_tokens": 64925244.0, + "step": 1781 + }, + { + "epoch": 0.3309192200557103, + "grad_norm": 1.511635184288025, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8605538606643677, + "num_tokens": 64962463.0, + "step": 1782 + }, + { + "epoch": 0.3311049210770659, + "grad_norm": 1.4694650173187256, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8574512004852295, + "num_tokens": 65000765.0, + "step": 1783 + }, + { + "epoch": 0.3312906220984215, + "grad_norm": 1.631522536277771, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8602554202079773, + "num_tokens": 65035285.0, + "step": 1784 + }, + { + "epoch": 0.33147632311977715, + "grad_norm": 1.6678473949432373, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8583570718765259, + "num_tokens": 65066851.0, + "step": 1785 + }, + { + "epoch": 0.33166202414113277, + "grad_norm": 1.6341493129730225, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8498961925506592, + "num_tokens": 65100884.0, + "step": 1786 + }, + { + "epoch": 0.3318477251624884, + "grad_norm": 1.598167896270752, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8464855551719666, + "num_tokens": 65138968.0, + "step": 1787 + }, + { + "epoch": 0.332033426183844, + "grad_norm": 1.652787208557129, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8682588934898376, + "num_tokens": 65172106.0, + "step": 1788 + }, + { + "epoch": 0.33221912720519964, + "grad_norm": 1.6381852626800537, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8523452281951904, + "num_tokens": 65209491.0, + "step": 1789 + }, + { + "epoch": 0.33240482822655526, + "grad_norm": 1.559675693511963, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8371469974517822, + "num_tokens": 65249873.0, + "step": 1790 + }, + { + "epoch": 0.3325905292479109, + "grad_norm": 1.4279685020446777, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8532532453536987, + "num_tokens": 65294184.0, + "step": 1791 + }, + { + "epoch": 0.3327762302692665, + "grad_norm": 1.7214081287384033, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8542261123657227, + "num_tokens": 65322455.0, + "step": 1792 + }, + { + "epoch": 0.33296193129062207, + "grad_norm": 1.5533671379089355, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8641456365585327, + "num_tokens": 65360016.0, + "step": 1793 + }, + { + "epoch": 0.3331476323119777, + "grad_norm": 1.6287198066711426, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8508973717689514, + "num_tokens": 65395052.0, + "step": 1794 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.5520319938659668, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8480240106582642, + "num_tokens": 65429910.0, + "step": 1795 + }, + { + "epoch": 0.33351903435468894, + "grad_norm": 1.7431004047393799, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8483803272247314, + "num_tokens": 65464737.0, + "step": 1796 + }, + { + "epoch": 0.33370473537604456, + "grad_norm": 1.5224350690841675, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8619102239608765, + "num_tokens": 65500716.0, + "step": 1797 + }, + { + "epoch": 0.3338904363974002, + "grad_norm": 1.5224897861480713, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8709142208099365, + "num_tokens": 65539662.0, + "step": 1798 + }, + { + "epoch": 0.3340761374187558, + "grad_norm": 1.531528115272522, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8650451898574829, + "num_tokens": 65578002.0, + "step": 1799 + }, + { + "epoch": 0.3342618384401114, + "grad_norm": 1.5419597625732422, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8541062474250793, + "num_tokens": 65622433.0, + "step": 1800 + }, + { + "epoch": 0.33444753946146705, + "grad_norm": 1.5296977758407593, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8700169324874878, + "num_tokens": 65658676.0, + "step": 1801 + }, + { + "epoch": 0.33463324048282267, + "grad_norm": 1.5788532495498657, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8596957921981812, + "num_tokens": 65693458.0, + "step": 1802 + }, + { + "epoch": 0.3348189415041783, + "grad_norm": 1.4792264699935913, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8604021668434143, + "num_tokens": 65735541.0, + "step": 1803 + }, + { + "epoch": 0.3350046425255339, + "grad_norm": 1.483597755432129, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.854442298412323, + "num_tokens": 65774209.0, + "step": 1804 + }, + { + "epoch": 0.33519034354688954, + "grad_norm": 1.7318758964538574, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8657839894294739, + "num_tokens": 65806032.0, + "step": 1805 + }, + { + "epoch": 0.3353760445682451, + "grad_norm": 1.5612488985061646, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8560338020324707, + "num_tokens": 65843884.0, + "step": 1806 + }, + { + "epoch": 0.3355617455896007, + "grad_norm": 1.6585617065429688, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8541126847267151, + "num_tokens": 65875789.0, + "step": 1807 + }, + { + "epoch": 0.33574744661095635, + "grad_norm": 1.5206966400146484, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8543972969055176, + "num_tokens": 65912166.0, + "step": 1808 + }, + { + "epoch": 0.33593314763231197, + "grad_norm": 1.5159366130828857, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8688157796859741, + "num_tokens": 65949351.0, + "step": 1809 + }, + { + "epoch": 0.3361188486536676, + "grad_norm": 1.4909061193466187, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.858599066734314, + "num_tokens": 65987099.0, + "step": 1810 + }, + { + "epoch": 0.3363045496750232, + "grad_norm": 1.6956661939620972, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8374798893928528, + "num_tokens": 66024983.0, + "step": 1811 + }, + { + "epoch": 0.33649025069637883, + "grad_norm": 1.6095865964889526, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8619101047515869, + "num_tokens": 66059456.0, + "step": 1812 + }, + { + "epoch": 0.33667595171773446, + "grad_norm": 1.5561177730560303, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8529989123344421, + "num_tokens": 66097746.0, + "step": 1813 + }, + { + "epoch": 0.3368616527390901, + "grad_norm": 1.4944713115692139, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.860024094581604, + "num_tokens": 66138487.0, + "step": 1814 + }, + { + "epoch": 0.3370473537604457, + "grad_norm": 1.780350685119629, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8433867692947388, + "num_tokens": 66171789.0, + "step": 1815 + }, + { + "epoch": 0.3372330547818013, + "grad_norm": 1.48026442527771, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.868837833404541, + "num_tokens": 66208272.0, + "step": 1816 + }, + { + "epoch": 0.33741875580315694, + "grad_norm": 1.5897667407989502, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8556697368621826, + "num_tokens": 66243996.0, + "step": 1817 + }, + { + "epoch": 0.3376044568245125, + "grad_norm": 1.4509073495864868, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8677215576171875, + "num_tokens": 66284449.0, + "step": 1818 + }, + { + "epoch": 0.33779015784586813, + "grad_norm": 1.5028128623962402, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8670040965080261, + "num_tokens": 66321446.0, + "step": 1819 + }, + { + "epoch": 0.33797585886722376, + "grad_norm": 1.4668582677841187, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8562089204788208, + "num_tokens": 66362829.0, + "step": 1820 + }, + { + "epoch": 0.3381615598885794, + "grad_norm": 1.5371425151824951, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.856697142124176, + "num_tokens": 66402202.0, + "step": 1821 + }, + { + "epoch": 0.338347260909935, + "grad_norm": 1.5373657941818237, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8493288159370422, + "num_tokens": 66438511.0, + "step": 1822 + }, + { + "epoch": 0.3385329619312906, + "grad_norm": 1.4212487936019897, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8713243007659912, + "num_tokens": 66478189.0, + "step": 1823 + }, + { + "epoch": 0.33871866295264624, + "grad_norm": 1.6450859308242798, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8682101964950562, + "num_tokens": 66507783.0, + "step": 1824 + }, + { + "epoch": 0.33890436397400187, + "grad_norm": 1.4569661617279053, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8594982028007507, + "num_tokens": 66547426.0, + "step": 1825 + }, + { + "epoch": 0.3390900649953575, + "grad_norm": 1.5082945823669434, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8607178926467896, + "num_tokens": 66587647.0, + "step": 1826 + }, + { + "epoch": 0.3392757660167131, + "grad_norm": 1.4530256986618042, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8570376038551331, + "num_tokens": 66628899.0, + "step": 1827 + }, + { + "epoch": 0.33946146703806873, + "grad_norm": 1.5870933532714844, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8505024909973145, + "num_tokens": 66666519.0, + "step": 1828 + }, + { + "epoch": 0.33964716805942435, + "grad_norm": 1.5238851308822632, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8450621962547302, + "num_tokens": 66705504.0, + "step": 1829 + }, + { + "epoch": 0.3398328690807799, + "grad_norm": 1.6284301280975342, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8582278490066528, + "num_tokens": 66738577.0, + "step": 1830 + }, + { + "epoch": 0.34001857010213554, + "grad_norm": 1.4706138372421265, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8467310667037964, + "num_tokens": 66780302.0, + "step": 1831 + }, + { + "epoch": 0.34020427112349116, + "grad_norm": 1.658010482788086, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8496435880661011, + "num_tokens": 66814793.0, + "step": 1832 + }, + { + "epoch": 0.3403899721448468, + "grad_norm": 1.6514091491699219, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8583447933197021, + "num_tokens": 66849049.0, + "step": 1833 + }, + { + "epoch": 0.3405756731662024, + "grad_norm": 1.5022894144058228, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8616669178009033, + "num_tokens": 66888510.0, + "step": 1834 + }, + { + "epoch": 0.34076137418755803, + "grad_norm": 1.5860563516616821, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8554387092590332, + "num_tokens": 66922495.0, + "step": 1835 + }, + { + "epoch": 0.34094707520891365, + "grad_norm": 1.6970659494400024, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8593040704727173, + "num_tokens": 66955767.0, + "step": 1836 + }, + { + "epoch": 0.3411327762302693, + "grad_norm": 1.6170676946640015, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8489919900894165, + "num_tokens": 66990906.0, + "step": 1837 + }, + { + "epoch": 0.3413184772516249, + "grad_norm": 1.46125066280365, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8409109115600586, + "num_tokens": 67031295.0, + "step": 1838 + }, + { + "epoch": 0.3415041782729805, + "grad_norm": 1.6315228939056396, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8556516766548157, + "num_tokens": 67066421.0, + "step": 1839 + }, + { + "epoch": 0.34168987929433614, + "grad_norm": 1.5873886346817017, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8613249659538269, + "num_tokens": 67101352.0, + "step": 1840 + }, + { + "epoch": 0.34187558031569176, + "grad_norm": 1.6436189413070679, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.848222017288208, + "num_tokens": 67136764.0, + "step": 1841 + }, + { + "epoch": 0.34206128133704733, + "grad_norm": 1.444921612739563, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8634376525878906, + "num_tokens": 67177164.0, + "step": 1842 + }, + { + "epoch": 0.34224698235840295, + "grad_norm": 1.7239975929260254, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8568257093429565, + "num_tokens": 67206805.0, + "step": 1843 + }, + { + "epoch": 0.3424326833797586, + "grad_norm": 1.5134894847869873, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.863562822341919, + "num_tokens": 67243672.0, + "step": 1844 + }, + { + "epoch": 0.3426183844011142, + "grad_norm": 1.5482292175292969, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8667103052139282, + "num_tokens": 67278383.0, + "step": 1845 + }, + { + "epoch": 0.3428040854224698, + "grad_norm": 1.5958036184310913, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8468747138977051, + "num_tokens": 67319345.0, + "step": 1846 + }, + { + "epoch": 0.34298978644382544, + "grad_norm": 1.464961051940918, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.865342378616333, + "num_tokens": 67356884.0, + "step": 1847 + }, + { + "epoch": 0.34317548746518106, + "grad_norm": 1.4946398735046387, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8539095520973206, + "num_tokens": 67397314.0, + "step": 1848 + }, + { + "epoch": 0.3433611884865367, + "grad_norm": 1.5795578956604004, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8585941791534424, + "num_tokens": 67434021.0, + "step": 1849 + }, + { + "epoch": 0.3435468895078923, + "grad_norm": 1.4001517295837402, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8640075922012329, + "num_tokens": 67480216.0, + "step": 1850 + }, + { + "epoch": 0.34373259052924793, + "grad_norm": 1.5724400281906128, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8358719944953918, + "num_tokens": 67516742.0, + "step": 1851 + }, + { + "epoch": 0.34391829155060355, + "grad_norm": 1.6795622110366821, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8576401472091675, + "num_tokens": 67551112.0, + "step": 1852 + }, + { + "epoch": 0.34410399257195917, + "grad_norm": 1.5586458444595337, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8529989123344421, + "num_tokens": 67587470.0, + "step": 1853 + }, + { + "epoch": 0.34428969359331474, + "grad_norm": 1.5142078399658203, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8589233160018921, + "num_tokens": 67627702.0, + "step": 1854 + }, + { + "epoch": 0.34447539461467036, + "grad_norm": 1.565683126449585, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8498520851135254, + "num_tokens": 67664729.0, + "step": 1855 + }, + { + "epoch": 0.344661095636026, + "grad_norm": 1.6080186367034912, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.858015239238739, + "num_tokens": 67699441.0, + "step": 1856 + }, + { + "epoch": 0.3448467966573816, + "grad_norm": 1.593895435333252, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8635745048522949, + "num_tokens": 67735013.0, + "step": 1857 + }, + { + "epoch": 0.3450324976787372, + "grad_norm": 1.544623613357544, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8718847036361694, + "num_tokens": 67768575.0, + "step": 1858 + }, + { + "epoch": 0.34521819870009285, + "grad_norm": 1.640788197517395, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8734266757965088, + "num_tokens": 67798576.0, + "step": 1859 + }, + { + "epoch": 0.34540389972144847, + "grad_norm": 1.566786289215088, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8667891025543213, + "num_tokens": 67831828.0, + "step": 1860 + }, + { + "epoch": 0.3455896007428041, + "grad_norm": 1.5858707427978516, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8598699569702148, + "num_tokens": 67865593.0, + "step": 1861 + }, + { + "epoch": 0.3457753017641597, + "grad_norm": 1.659200668334961, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8504445552825928, + "num_tokens": 67902336.0, + "step": 1862 + }, + { + "epoch": 0.34596100278551534, + "grad_norm": 1.7470042705535889, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8532411456108093, + "num_tokens": 67935748.0, + "step": 1863 + }, + { + "epoch": 0.34614670380687096, + "grad_norm": 1.4436169862747192, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8568945527076721, + "num_tokens": 67976098.0, + "step": 1864 + }, + { + "epoch": 0.3463324048282266, + "grad_norm": 1.5983843803405762, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8616392612457275, + "num_tokens": 68011353.0, + "step": 1865 + }, + { + "epoch": 0.34651810584958215, + "grad_norm": 1.542015790939331, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8635695576667786, + "num_tokens": 68044236.0, + "step": 1866 + }, + { + "epoch": 0.34670380687093777, + "grad_norm": 1.5931440591812134, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8507915735244751, + "num_tokens": 68082047.0, + "step": 1867 + }, + { + "epoch": 0.3468895078922934, + "grad_norm": 1.4727513790130615, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8754779100418091, + "num_tokens": 68120542.0, + "step": 1868 + }, + { + "epoch": 0.347075208913649, + "grad_norm": 1.4784590005874634, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8718779683113098, + "num_tokens": 68155163.0, + "step": 1869 + }, + { + "epoch": 0.34726090993500464, + "grad_norm": 1.5778881311416626, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8447883725166321, + "num_tokens": 68192678.0, + "step": 1870 + }, + { + "epoch": 0.34744661095636026, + "grad_norm": 1.5963876247406006, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8599473237991333, + "num_tokens": 68226521.0, + "step": 1871 + }, + { + "epoch": 0.3476323119777159, + "grad_norm": 1.5694993734359741, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8564333915710449, + "num_tokens": 68261657.0, + "step": 1872 + }, + { + "epoch": 0.3478180129990715, + "grad_norm": 1.614081621170044, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8625587821006775, + "num_tokens": 68298454.0, + "step": 1873 + }, + { + "epoch": 0.3480037140204271, + "grad_norm": 1.5102952718734741, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8581907153129578, + "num_tokens": 68336627.0, + "step": 1874 + }, + { + "epoch": 0.34818941504178275, + "grad_norm": 1.7954646348953247, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8563140034675598, + "num_tokens": 68367021.0, + "step": 1875 + }, + { + "epoch": 0.34837511606313837, + "grad_norm": 1.6704683303833008, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8703144788742065, + "num_tokens": 68398602.0, + "step": 1876 + }, + { + "epoch": 0.348560817084494, + "grad_norm": 1.6454452276229858, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8622078895568848, + "num_tokens": 68431609.0, + "step": 1877 + }, + { + "epoch": 0.34874651810584956, + "grad_norm": 1.691210150718689, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8460086584091187, + "num_tokens": 68467239.0, + "step": 1878 + }, + { + "epoch": 0.3489322191272052, + "grad_norm": 1.5533428192138672, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8536436557769775, + "num_tokens": 68509475.0, + "step": 1879 + }, + { + "epoch": 0.3491179201485608, + "grad_norm": 1.7262568473815918, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8647453784942627, + "num_tokens": 68538263.0, + "step": 1880 + }, + { + "epoch": 0.3493036211699164, + "grad_norm": 1.5414133071899414, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8609973192214966, + "num_tokens": 68573883.0, + "step": 1881 + }, + { + "epoch": 0.34948932219127204, + "grad_norm": 1.6428208351135254, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8698561191558838, + "num_tokens": 68609814.0, + "step": 1882 + }, + { + "epoch": 0.34967502321262767, + "grad_norm": 1.8104913234710693, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8581341505050659, + "num_tokens": 68639642.0, + "step": 1883 + }, + { + "epoch": 0.3498607242339833, + "grad_norm": 1.4723777770996094, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8644318580627441, + "num_tokens": 68677441.0, + "step": 1884 + }, + { + "epoch": 0.3500464252553389, + "grad_norm": 1.4367436170578003, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8776534795761108, + "num_tokens": 68714080.0, + "step": 1885 + }, + { + "epoch": 0.35023212627669453, + "grad_norm": 1.5345391035079956, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.864266574382782, + "num_tokens": 68748925.0, + "step": 1886 + }, + { + "epoch": 0.35041782729805016, + "grad_norm": 1.5740089416503906, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8654234409332275, + "num_tokens": 68781771.0, + "step": 1887 + }, + { + "epoch": 0.3506035283194058, + "grad_norm": 1.5190242528915405, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8484438061714172, + "num_tokens": 68821162.0, + "step": 1888 + }, + { + "epoch": 0.3507892293407614, + "grad_norm": 1.4925878047943115, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.861798882484436, + "num_tokens": 68860447.0, + "step": 1889 + }, + { + "epoch": 0.35097493036211697, + "grad_norm": 1.6828539371490479, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8445902466773987, + "num_tokens": 68896159.0, + "step": 1890 + }, + { + "epoch": 0.3511606313834726, + "grad_norm": 1.502134084701538, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8842071294784546, + "num_tokens": 68931425.0, + "step": 1891 + }, + { + "epoch": 0.3513463324048282, + "grad_norm": 1.7063219547271729, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8655242919921875, + "num_tokens": 68964478.0, + "step": 1892 + }, + { + "epoch": 0.35153203342618383, + "grad_norm": 1.5360478162765503, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8609909415245056, + "num_tokens": 69002400.0, + "step": 1893 + }, + { + "epoch": 0.35171773444753945, + "grad_norm": 1.645289421081543, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8541271090507507, + "num_tokens": 69039349.0, + "step": 1894 + }, + { + "epoch": 0.3519034354688951, + "grad_norm": 1.5187184810638428, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8681796193122864, + "num_tokens": 69075746.0, + "step": 1895 + }, + { + "epoch": 0.3520891364902507, + "grad_norm": 1.5400176048278809, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8677488565444946, + "num_tokens": 69113235.0, + "step": 1896 + }, + { + "epoch": 0.3522748375116063, + "grad_norm": 1.5754107236862183, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8730037212371826, + "num_tokens": 69147525.0, + "step": 1897 + }, + { + "epoch": 0.35246053853296194, + "grad_norm": 1.4909031391143799, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8565669059753418, + "num_tokens": 69189027.0, + "step": 1898 + }, + { + "epoch": 0.35264623955431756, + "grad_norm": 1.435218334197998, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8528836965560913, + "num_tokens": 69234121.0, + "step": 1899 + }, + { + "epoch": 0.3528319405756732, + "grad_norm": 1.6210154294967651, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8474977612495422, + "num_tokens": 69271021.0, + "step": 1900 + }, + { + "epoch": 0.3530176415970288, + "grad_norm": 1.5236482620239258, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8604587316513062, + "num_tokens": 69307371.0, + "step": 1901 + }, + { + "epoch": 0.3532033426183844, + "grad_norm": 1.395906686782837, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8632291555404663, + "num_tokens": 69346697.0, + "step": 1902 + }, + { + "epoch": 0.35338904363974, + "grad_norm": 1.607025384902954, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8691086769104004, + "num_tokens": 69380393.0, + "step": 1903 + }, + { + "epoch": 0.3535747446610956, + "grad_norm": 1.5616167783737183, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8647134304046631, + "num_tokens": 69416349.0, + "step": 1904 + }, + { + "epoch": 0.35376044568245124, + "grad_norm": 1.4766689538955688, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8692907094955444, + "num_tokens": 69454151.0, + "step": 1905 + }, + { + "epoch": 0.35394614670380686, + "grad_norm": 1.5873836278915405, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8649806976318359, + "num_tokens": 69487052.0, + "step": 1906 + }, + { + "epoch": 0.3541318477251625, + "grad_norm": 1.6051193475723267, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.867255687713623, + "num_tokens": 69519729.0, + "step": 1907 + }, + { + "epoch": 0.3543175487465181, + "grad_norm": 1.524599552154541, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8616821765899658, + "num_tokens": 69555965.0, + "step": 1908 + }, + { + "epoch": 0.35450324976787373, + "grad_norm": 1.4407594203948975, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8595306873321533, + "num_tokens": 69595587.0, + "step": 1909 + }, + { + "epoch": 0.35468895078922935, + "grad_norm": 1.5965285301208496, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.864220917224884, + "num_tokens": 69630030.0, + "step": 1910 + }, + { + "epoch": 0.354874651810585, + "grad_norm": 1.5484662055969238, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8658243417739868, + "num_tokens": 69669720.0, + "step": 1911 + }, + { + "epoch": 0.3550603528319406, + "grad_norm": 1.615240216255188, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8385143280029297, + "num_tokens": 69707930.0, + "step": 1912 + }, + { + "epoch": 0.3552460538532962, + "grad_norm": 1.577754259109497, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8594491481781006, + "num_tokens": 69746391.0, + "step": 1913 + }, + { + "epoch": 0.3554317548746518, + "grad_norm": 1.774223804473877, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8513190746307373, + "num_tokens": 69778146.0, + "step": 1914 + }, + { + "epoch": 0.3556174558960074, + "grad_norm": 1.6410125494003296, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8493281602859497, + "num_tokens": 69812914.0, + "step": 1915 + }, + { + "epoch": 0.35580315691736303, + "grad_norm": 1.5939717292785645, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8499147295951843, + "num_tokens": 69851037.0, + "step": 1916 + }, + { + "epoch": 0.35598885793871865, + "grad_norm": 1.62191903591156, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8574842214584351, + "num_tokens": 69886060.0, + "step": 1917 + }, + { + "epoch": 0.3561745589600743, + "grad_norm": 1.6888304948806763, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8512516617774963, + "num_tokens": 69919340.0, + "step": 1918 + }, + { + "epoch": 0.3563602599814299, + "grad_norm": 1.6296037435531616, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8500938415527344, + "num_tokens": 69951978.0, + "step": 1919 + }, + { + "epoch": 0.3565459610027855, + "grad_norm": 1.574837327003479, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8550461530685425, + "num_tokens": 69986375.0, + "step": 1920 + }, + { + "epoch": 0.35673166202414114, + "grad_norm": 1.426872968673706, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8669897317886353, + "num_tokens": 70025868.0, + "step": 1921 + }, + { + "epoch": 0.35691736304549676, + "grad_norm": 1.5607645511627197, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8571703433990479, + "num_tokens": 70064652.0, + "step": 1922 + }, + { + "epoch": 0.3571030640668524, + "grad_norm": 1.573514699935913, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8527724146842957, + "num_tokens": 70101399.0, + "step": 1923 + }, + { + "epoch": 0.357288765088208, + "grad_norm": 1.5386614799499512, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8732396364212036, + "num_tokens": 70133715.0, + "step": 1924 + }, + { + "epoch": 0.3574744661095636, + "grad_norm": 1.5537700653076172, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8409644365310669, + "num_tokens": 70172808.0, + "step": 1925 + }, + { + "epoch": 0.3576601671309192, + "grad_norm": 1.4195321798324585, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8733859658241272, + "num_tokens": 70213437.0, + "step": 1926 + }, + { + "epoch": 0.3578458681522748, + "grad_norm": 1.5710283517837524, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8539668321609497, + "num_tokens": 70251271.0, + "step": 1927 + }, + { + "epoch": 0.35803156917363044, + "grad_norm": 1.6817294359207153, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.86163729429245, + "num_tokens": 70284515.0, + "step": 1928 + }, + { + "epoch": 0.35821727019498606, + "grad_norm": 1.430184006690979, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.851791262626648, + "num_tokens": 70323926.0, + "step": 1929 + }, + { + "epoch": 0.3584029712163417, + "grad_norm": 1.5513737201690674, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8692468404769897, + "num_tokens": 70359942.0, + "step": 1930 + }, + { + "epoch": 0.3585886722376973, + "grad_norm": 1.5808937549591064, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.859135627746582, + "num_tokens": 70395168.0, + "step": 1931 + }, + { + "epoch": 0.3587743732590529, + "grad_norm": 1.4868727922439575, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8652256727218628, + "num_tokens": 70435790.0, + "step": 1932 + }, + { + "epoch": 0.35896007428040855, + "grad_norm": 1.7609502077102661, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.863570511341095, + "num_tokens": 70465881.0, + "step": 1933 + }, + { + "epoch": 0.35914577530176417, + "grad_norm": 1.6117236614227295, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8586418628692627, + "num_tokens": 70502593.0, + "step": 1934 + }, + { + "epoch": 0.3593314763231198, + "grad_norm": 1.4926594495773315, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.865517258644104, + "num_tokens": 70541323.0, + "step": 1935 + }, + { + "epoch": 0.3595171773444754, + "grad_norm": 1.4750250577926636, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8719162940979004, + "num_tokens": 70579161.0, + "step": 1936 + }, + { + "epoch": 0.35970287836583104, + "grad_norm": 1.542781114578247, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8556277751922607, + "num_tokens": 70616582.0, + "step": 1937 + }, + { + "epoch": 0.3598885793871866, + "grad_norm": 1.6017369031906128, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8692247867584229, + "num_tokens": 70653313.0, + "step": 1938 + }, + { + "epoch": 0.3600742804085422, + "grad_norm": 1.6651287078857422, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.852375864982605, + "num_tokens": 70686987.0, + "step": 1939 + }, + { + "epoch": 0.36025998142989785, + "grad_norm": 1.6071360111236572, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8581337332725525, + "num_tokens": 70721464.0, + "step": 1940 + }, + { + "epoch": 0.36044568245125347, + "grad_norm": 1.5537594556808472, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8682912588119507, + "num_tokens": 70754655.0, + "step": 1941 + }, + { + "epoch": 0.3606313834726091, + "grad_norm": 1.4873671531677246, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8789500594139099, + "num_tokens": 70789751.0, + "step": 1942 + }, + { + "epoch": 0.3608170844939647, + "grad_norm": 1.506253957748413, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.865774393081665, + "num_tokens": 70825959.0, + "step": 1943 + }, + { + "epoch": 0.36100278551532033, + "grad_norm": 1.6106398105621338, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8655455112457275, + "num_tokens": 70858839.0, + "step": 1944 + }, + { + "epoch": 0.36118848653667596, + "grad_norm": 1.6618735790252686, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8407430052757263, + "num_tokens": 70891787.0, + "step": 1945 + }, + { + "epoch": 0.3613741875580316, + "grad_norm": 1.640663981437683, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8514987826347351, + "num_tokens": 70926895.0, + "step": 1946 + }, + { + "epoch": 0.3615598885793872, + "grad_norm": 1.7902733087539673, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8436441421508789, + "num_tokens": 70958798.0, + "step": 1947 + }, + { + "epoch": 0.3617455896007428, + "grad_norm": 1.5871084928512573, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8496887683868408, + "num_tokens": 70993502.0, + "step": 1948 + }, + { + "epoch": 0.36193129062209844, + "grad_norm": 1.4975693225860596, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8770219087600708, + "num_tokens": 71028723.0, + "step": 1949 + }, + { + "epoch": 0.362116991643454, + "grad_norm": 1.5552303791046143, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8587784767150879, + "num_tokens": 71066592.0, + "step": 1950 + }, + { + "epoch": 0.36230269266480963, + "grad_norm": 1.6317367553710938, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8596838712692261, + "num_tokens": 71104665.0, + "step": 1951 + }, + { + "epoch": 0.36248839368616526, + "grad_norm": 1.5100375413894653, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8698073625564575, + "num_tokens": 71143904.0, + "step": 1952 + }, + { + "epoch": 0.3626740947075209, + "grad_norm": 1.5465525388717651, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8568826913833618, + "num_tokens": 71181689.0, + "step": 1953 + }, + { + "epoch": 0.3628597957288765, + "grad_norm": 1.588783860206604, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8627378940582275, + "num_tokens": 71216805.0, + "step": 1954 + }, + { + "epoch": 0.3630454967502321, + "grad_norm": 1.6025310754776, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8591274619102478, + "num_tokens": 71254159.0, + "step": 1955 + }, + { + "epoch": 0.36323119777158774, + "grad_norm": 1.5577393770217896, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8750661611557007, + "num_tokens": 71291135.0, + "step": 1956 + }, + { + "epoch": 0.36341689879294337, + "grad_norm": 1.5904706716537476, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8577672243118286, + "num_tokens": 71327880.0, + "step": 1957 + }, + { + "epoch": 0.363602599814299, + "grad_norm": 1.6722153425216675, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8423035144805908, + "num_tokens": 71361960.0, + "step": 1958 + }, + { + "epoch": 0.3637883008356546, + "grad_norm": 1.5207507610321045, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8655036687850952, + "num_tokens": 71399949.0, + "step": 1959 + }, + { + "epoch": 0.36397400185701023, + "grad_norm": 1.6705759763717651, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8538144826889038, + "num_tokens": 71435999.0, + "step": 1960 + }, + { + "epoch": 0.36415970287836585, + "grad_norm": 1.6844921112060547, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8539567589759827, + "num_tokens": 71468779.0, + "step": 1961 + }, + { + "epoch": 0.3643454038997215, + "grad_norm": 1.767594337463379, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8491586446762085, + "num_tokens": 71501591.0, + "step": 1962 + }, + { + "epoch": 0.36453110492107704, + "grad_norm": 1.662215232849121, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8564985990524292, + "num_tokens": 71538126.0, + "step": 1963 + }, + { + "epoch": 0.36471680594243266, + "grad_norm": 1.4914114475250244, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8717412352561951, + "num_tokens": 71577944.0, + "step": 1964 + }, + { + "epoch": 0.3649025069637883, + "grad_norm": 1.6291677951812744, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8682717680931091, + "num_tokens": 71610802.0, + "step": 1965 + }, + { + "epoch": 0.3650882079851439, + "grad_norm": 1.6696470975875854, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8446990847587585, + "num_tokens": 71648702.0, + "step": 1966 + }, + { + "epoch": 0.36527390900649953, + "grad_norm": 1.5590426921844482, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8672850131988525, + "num_tokens": 71687780.0, + "step": 1967 + }, + { + "epoch": 0.36545961002785515, + "grad_norm": 1.5529630184173584, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8580224514007568, + "num_tokens": 71729798.0, + "step": 1968 + }, + { + "epoch": 0.3656453110492108, + "grad_norm": 1.5416048765182495, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8605806231498718, + "num_tokens": 71768670.0, + "step": 1969 + }, + { + "epoch": 0.3658310120705664, + "grad_norm": 1.669979214668274, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8353817462921143, + "num_tokens": 71805250.0, + "step": 1970 + }, + { + "epoch": 0.366016713091922, + "grad_norm": 1.5831005573272705, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8591707348823547, + "num_tokens": 71840517.0, + "step": 1971 + }, + { + "epoch": 0.36620241411327764, + "grad_norm": 1.6437641382217407, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8589173555374146, + "num_tokens": 71879406.0, + "step": 1972 + }, + { + "epoch": 0.36638811513463326, + "grad_norm": 1.553935170173645, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8722628355026245, + "num_tokens": 71915708.0, + "step": 1973 + }, + { + "epoch": 0.3665738161559889, + "grad_norm": 1.6447111368179321, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8666288256645203, + "num_tokens": 71946021.0, + "step": 1974 + }, + { + "epoch": 0.36675951717734445, + "grad_norm": 1.5453730821609497, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8548601865768433, + "num_tokens": 71985049.0, + "step": 1975 + }, + { + "epoch": 0.3669452181987001, + "grad_norm": 1.6103073358535767, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8517823815345764, + "num_tokens": 72023294.0, + "step": 1976 + }, + { + "epoch": 0.3671309192200557, + "grad_norm": 1.7757631540298462, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.858707070350647, + "num_tokens": 72056890.0, + "step": 1977 + }, + { + "epoch": 0.3673166202414113, + "grad_norm": 1.728832721710205, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8573204278945923, + "num_tokens": 72091331.0, + "step": 1978 + }, + { + "epoch": 0.36750232126276694, + "grad_norm": 1.6336971521377563, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8484628200531006, + "num_tokens": 72128353.0, + "step": 1979 + }, + { + "epoch": 0.36768802228412256, + "grad_norm": 1.7168045043945312, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.861667811870575, + "num_tokens": 72161443.0, + "step": 1980 + }, + { + "epoch": 0.3678737233054782, + "grad_norm": 1.6854679584503174, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8612104654312134, + "num_tokens": 72197132.0, + "step": 1981 + }, + { + "epoch": 0.3680594243268338, + "grad_norm": 1.5642871856689453, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8520108461380005, + "num_tokens": 72232237.0, + "step": 1982 + }, + { + "epoch": 0.36824512534818943, + "grad_norm": 1.618131160736084, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8609193563461304, + "num_tokens": 72270735.0, + "step": 1983 + }, + { + "epoch": 0.36843082636954505, + "grad_norm": 1.4474310874938965, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8552525043487549, + "num_tokens": 72316592.0, + "step": 1984 + }, + { + "epoch": 0.3686165273909007, + "grad_norm": 1.5986988544464111, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8483119010925293, + "num_tokens": 72355537.0, + "step": 1985 + }, + { + "epoch": 0.3688022284122563, + "grad_norm": 1.5491001605987549, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8520963191986084, + "num_tokens": 72392954.0, + "step": 1986 + }, + { + "epoch": 0.36898792943361186, + "grad_norm": 1.8549226522445679, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8387392163276672, + "num_tokens": 72422783.0, + "step": 1987 + }, + { + "epoch": 0.3691736304549675, + "grad_norm": 1.5730500221252441, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8527209162712097, + "num_tokens": 72461196.0, + "step": 1988 + }, + { + "epoch": 0.3693593314763231, + "grad_norm": 1.7326117753982544, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8537092208862305, + "num_tokens": 72496895.0, + "step": 1989 + }, + { + "epoch": 0.3695450324976787, + "grad_norm": 1.6563282012939453, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.869096577167511, + "num_tokens": 72531471.0, + "step": 1990 + }, + { + "epoch": 0.36973073351903435, + "grad_norm": 1.5373706817626953, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8599928021430969, + "num_tokens": 72571018.0, + "step": 1991 + }, + { + "epoch": 0.36991643454038997, + "grad_norm": 1.785144567489624, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8590191602706909, + "num_tokens": 72602581.0, + "step": 1992 + }, + { + "epoch": 0.3701021355617456, + "grad_norm": 1.45164155960083, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8479388356208801, + "num_tokens": 72643620.0, + "step": 1993 + }, + { + "epoch": 0.3702878365831012, + "grad_norm": 1.5330086946487427, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8531805276870728, + "num_tokens": 72680614.0, + "step": 1994 + }, + { + "epoch": 0.37047353760445684, + "grad_norm": 1.4958674907684326, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8696919083595276, + "num_tokens": 72721738.0, + "step": 1995 + }, + { + "epoch": 0.37065923862581246, + "grad_norm": 1.471205472946167, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8557040691375732, + "num_tokens": 72761851.0, + "step": 1996 + }, + { + "epoch": 0.3708449396471681, + "grad_norm": 1.619400978088379, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.855892539024353, + "num_tokens": 72796752.0, + "step": 1997 + }, + { + "epoch": 0.3710306406685237, + "grad_norm": 1.5737130641937256, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.852426290512085, + "num_tokens": 72836323.0, + "step": 1998 + }, + { + "epoch": 0.37121634168987927, + "grad_norm": 1.5656907558441162, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8522306680679321, + "num_tokens": 72872917.0, + "step": 1999 + }, + { + "epoch": 0.3714020427112349, + "grad_norm": 1.4867204427719116, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8741235733032227, + "num_tokens": 72908801.0, + "step": 2000 + }, + { + "epoch": 0.3715877437325905, + "grad_norm": 1.673649549484253, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8631926774978638, + "num_tokens": 72942266.0, + "step": 2001 + }, + { + "epoch": 0.37177344475394614, + "grad_norm": 1.4623297452926636, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8769292831420898, + "num_tokens": 72978276.0, + "step": 2002 + }, + { + "epoch": 0.37195914577530176, + "grad_norm": 1.5137768983840942, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8564730882644653, + "num_tokens": 73015554.0, + "step": 2003 + }, + { + "epoch": 0.3721448467966574, + "grad_norm": 1.53400456905365, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.869425356388092, + "num_tokens": 73053000.0, + "step": 2004 + }, + { + "epoch": 0.372330547818013, + "grad_norm": 1.5807597637176514, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8591878414154053, + "num_tokens": 73088921.0, + "step": 2005 + }, + { + "epoch": 0.3725162488393686, + "grad_norm": 1.5320920944213867, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8673655986785889, + "num_tokens": 73125695.0, + "step": 2006 + }, + { + "epoch": 0.37270194986072425, + "grad_norm": 1.749194860458374, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8610893487930298, + "num_tokens": 73157103.0, + "step": 2007 + }, + { + "epoch": 0.37288765088207987, + "grad_norm": 1.6733413934707642, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8711553812026978, + "num_tokens": 73187329.0, + "step": 2008 + }, + { + "epoch": 0.3730733519034355, + "grad_norm": 1.573682427406311, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8713541030883789, + "num_tokens": 73218043.0, + "step": 2009 + }, + { + "epoch": 0.3732590529247911, + "grad_norm": 1.446347713470459, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8857677578926086, + "num_tokens": 73255448.0, + "step": 2010 + }, + { + "epoch": 0.3734447539461467, + "grad_norm": 1.637334942817688, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8702322840690613, + "num_tokens": 73292421.0, + "step": 2011 + }, + { + "epoch": 0.3736304549675023, + "grad_norm": 1.6780332326889038, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8690491914749146, + "num_tokens": 73325308.0, + "step": 2012 + }, + { + "epoch": 0.3738161559888579, + "grad_norm": 1.6273528337478638, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8563463091850281, + "num_tokens": 73361748.0, + "step": 2013 + }, + { + "epoch": 0.37400185701021355, + "grad_norm": 1.639892578125, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8611348867416382, + "num_tokens": 73394555.0, + "step": 2014 + }, + { + "epoch": 0.37418755803156917, + "grad_norm": 1.6137144565582275, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8544780611991882, + "num_tokens": 73431623.0, + "step": 2015 + }, + { + "epoch": 0.3743732590529248, + "grad_norm": 1.6476739645004272, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8599808216094971, + "num_tokens": 73464289.0, + "step": 2016 + }, + { + "epoch": 0.3745589600742804, + "grad_norm": 1.7621921300888062, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8506255149841309, + "num_tokens": 73497993.0, + "step": 2017 + }, + { + "epoch": 0.37474466109563603, + "grad_norm": 1.4494507312774658, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8624169826507568, + "num_tokens": 73539120.0, + "step": 2018 + }, + { + "epoch": 0.37493036211699166, + "grad_norm": 1.6532241106033325, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8578982353210449, + "num_tokens": 73571783.0, + "step": 2019 + }, + { + "epoch": 0.3751160631383473, + "grad_norm": 1.508686900138855, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8714656829833984, + "num_tokens": 73607765.0, + "step": 2020 + }, + { + "epoch": 0.3753017641597029, + "grad_norm": 1.4883711338043213, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8783043622970581, + "num_tokens": 73642588.0, + "step": 2021 + }, + { + "epoch": 0.3754874651810585, + "grad_norm": 1.5885241031646729, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.858492910861969, + "num_tokens": 73684586.0, + "step": 2022 + }, + { + "epoch": 0.3756731662024141, + "grad_norm": 1.6749556064605713, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8519525527954102, + "num_tokens": 73719277.0, + "step": 2023 + }, + { + "epoch": 0.3758588672237697, + "grad_norm": 1.460903525352478, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8515603542327881, + "num_tokens": 73762557.0, + "step": 2024 + }, + { + "epoch": 0.37604456824512533, + "grad_norm": 1.7745622396469116, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8500467538833618, + "num_tokens": 73791717.0, + "step": 2025 + }, + { + "epoch": 0.37623026926648095, + "grad_norm": 1.6759898662567139, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8476718068122864, + "num_tokens": 73825546.0, + "step": 2026 + }, + { + "epoch": 0.3764159702878366, + "grad_norm": 1.6503214836120605, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8668195009231567, + "num_tokens": 73857288.0, + "step": 2027 + }, + { + "epoch": 0.3766016713091922, + "grad_norm": 1.484963297843933, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8547539114952087, + "num_tokens": 73895506.0, + "step": 2028 + }, + { + "epoch": 0.3767873723305478, + "grad_norm": 1.539069414138794, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8629456758499146, + "num_tokens": 73932764.0, + "step": 2029 + }, + { + "epoch": 0.37697307335190344, + "grad_norm": 1.6865289211273193, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8489662408828735, + "num_tokens": 73967312.0, + "step": 2030 + }, + { + "epoch": 0.37715877437325906, + "grad_norm": 1.5089281797409058, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8534051179885864, + "num_tokens": 74003748.0, + "step": 2031 + }, + { + "epoch": 0.3773444753946147, + "grad_norm": 1.5152393579483032, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8663371205329895, + "num_tokens": 74037838.0, + "step": 2032 + }, + { + "epoch": 0.3775301764159703, + "grad_norm": 1.8037211894989014, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8424453139305115, + "num_tokens": 74069147.0, + "step": 2033 + }, + { + "epoch": 0.37771587743732593, + "grad_norm": 1.6631494760513306, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8588419556617737, + "num_tokens": 74103324.0, + "step": 2034 + }, + { + "epoch": 0.3779015784586815, + "grad_norm": 1.423862099647522, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8562120199203491, + "num_tokens": 74144412.0, + "step": 2035 + }, + { + "epoch": 0.3780872794800371, + "grad_norm": 1.6220223903656006, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8607892394065857, + "num_tokens": 74179702.0, + "step": 2036 + }, + { + "epoch": 0.37827298050139274, + "grad_norm": 1.513784646987915, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8671491146087646, + "num_tokens": 74218319.0, + "step": 2037 + }, + { + "epoch": 0.37845868152274836, + "grad_norm": 1.572270154953003, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8600797653198242, + "num_tokens": 74255461.0, + "step": 2038 + }, + { + "epoch": 0.378644382544104, + "grad_norm": 1.4868378639221191, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8466222286224365, + "num_tokens": 74295600.0, + "step": 2039 + }, + { + "epoch": 0.3788300835654596, + "grad_norm": 1.4539892673492432, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8543175458908081, + "num_tokens": 74337832.0, + "step": 2040 + }, + { + "epoch": 0.37901578458681523, + "grad_norm": 1.5577130317687988, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8507003784179688, + "num_tokens": 74378351.0, + "step": 2041 + }, + { + "epoch": 0.37920148560817085, + "grad_norm": 1.5326919555664062, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8646986484527588, + "num_tokens": 74414456.0, + "step": 2042 + }, + { + "epoch": 0.3793871866295265, + "grad_norm": 1.514660120010376, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8670399785041809, + "num_tokens": 74453565.0, + "step": 2043 + }, + { + "epoch": 0.3795728876508821, + "grad_norm": 1.6182163953781128, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8598557114601135, + "num_tokens": 74489128.0, + "step": 2044 + }, + { + "epoch": 0.3797585886722377, + "grad_norm": 1.4690535068511963, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8682860136032104, + "num_tokens": 74528698.0, + "step": 2045 + }, + { + "epoch": 0.37994428969359334, + "grad_norm": 1.4626520872116089, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.867095410823822, + "num_tokens": 74569931.0, + "step": 2046 + }, + { + "epoch": 0.3801299907149489, + "grad_norm": 1.5757317543029785, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8706991672515869, + "num_tokens": 74603781.0, + "step": 2047 + }, + { + "epoch": 0.38031569173630453, + "grad_norm": 1.6297123432159424, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8549314737319946, + "num_tokens": 74636287.0, + "step": 2048 + }, + { + "epoch": 0.38050139275766015, + "grad_norm": 1.7814615964889526, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8375905156135559, + "num_tokens": 74670774.0, + "step": 2049 + }, + { + "epoch": 0.3806870937790158, + "grad_norm": 1.4556853771209717, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8614695072174072, + "num_tokens": 74713757.0, + "step": 2050 + }, + { + "epoch": 0.3808727948003714, + "grad_norm": 1.6104413270950317, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.861094057559967, + "num_tokens": 74748596.0, + "step": 2051 + }, + { + "epoch": 0.381058495821727, + "grad_norm": 1.3631659746170044, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8643227815628052, + "num_tokens": 74789790.0, + "step": 2052 + }, + { + "epoch": 0.38124419684308264, + "grad_norm": 1.6033730506896973, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8472009301185608, + "num_tokens": 74824976.0, + "step": 2053 + }, + { + "epoch": 0.38142989786443826, + "grad_norm": 1.6189085245132446, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8650396466255188, + "num_tokens": 74856608.0, + "step": 2054 + }, + { + "epoch": 0.3816155988857939, + "grad_norm": 1.4516983032226562, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8574004769325256, + "num_tokens": 74896580.0, + "step": 2055 + }, + { + "epoch": 0.3818012999071495, + "grad_norm": 1.637281894683838, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8476405143737793, + "num_tokens": 74930570.0, + "step": 2056 + }, + { + "epoch": 0.3819870009285051, + "grad_norm": 1.6359678506851196, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8617146611213684, + "num_tokens": 74965243.0, + "step": 2057 + }, + { + "epoch": 0.38217270194986075, + "grad_norm": 1.5746331214904785, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8684943914413452, + "num_tokens": 74997430.0, + "step": 2058 + }, + { + "epoch": 0.3823584029712163, + "grad_norm": 1.8504338264465332, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8509467840194702, + "num_tokens": 75028158.0, + "step": 2059 + }, + { + "epoch": 0.38254410399257194, + "grad_norm": 1.7024744749069214, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8659675121307373, + "num_tokens": 75059201.0, + "step": 2060 + }, + { + "epoch": 0.38272980501392756, + "grad_norm": 1.6405839920043945, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8687196373939514, + "num_tokens": 75088647.0, + "step": 2061 + }, + { + "epoch": 0.3829155060352832, + "grad_norm": 1.4893014430999756, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8684777021408081, + "num_tokens": 75128922.0, + "step": 2062 + }, + { + "epoch": 0.3831012070566388, + "grad_norm": 1.557870626449585, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8621957898139954, + "num_tokens": 75166395.0, + "step": 2063 + }, + { + "epoch": 0.3832869080779944, + "grad_norm": 1.5221796035766602, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8669426441192627, + "num_tokens": 75206454.0, + "step": 2064 + }, + { + "epoch": 0.38347260909935005, + "grad_norm": 1.5476421117782593, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8517032861709595, + "num_tokens": 75245155.0, + "step": 2065 + }, + { + "epoch": 0.38365831012070567, + "grad_norm": 1.564477562904358, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8582831621170044, + "num_tokens": 75282155.0, + "step": 2066 + }, + { + "epoch": 0.3838440111420613, + "grad_norm": 1.5502636432647705, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8585764169692993, + "num_tokens": 75316061.0, + "step": 2067 + }, + { + "epoch": 0.3840297121634169, + "grad_norm": 1.6115831136703491, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8702000975608826, + "num_tokens": 75351007.0, + "step": 2068 + }, + { + "epoch": 0.38421541318477254, + "grad_norm": 1.527815341949463, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8536422252655029, + "num_tokens": 75390726.0, + "step": 2069 + }, + { + "epoch": 0.38440111420612816, + "grad_norm": 1.498895525932312, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8609799146652222, + "num_tokens": 75424782.0, + "step": 2070 + }, + { + "epoch": 0.3845868152274837, + "grad_norm": 1.7217291593551636, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.850555419921875, + "num_tokens": 75460387.0, + "step": 2071 + }, + { + "epoch": 0.38477251624883935, + "grad_norm": 1.5518763065338135, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8542319536209106, + "num_tokens": 75498394.0, + "step": 2072 + }, + { + "epoch": 0.38495821727019497, + "grad_norm": 1.6900397539138794, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8490646481513977, + "num_tokens": 75530092.0, + "step": 2073 + }, + { + "epoch": 0.3851439182915506, + "grad_norm": 1.5047852993011475, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8541509509086609, + "num_tokens": 75567149.0, + "step": 2074 + }, + { + "epoch": 0.3853296193129062, + "grad_norm": 1.5788246393203735, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8659948706626892, + "num_tokens": 75602311.0, + "step": 2075 + }, + { + "epoch": 0.38551532033426184, + "grad_norm": 1.6848210096359253, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8429632186889648, + "num_tokens": 75639674.0, + "step": 2076 + }, + { + "epoch": 0.38570102135561746, + "grad_norm": 1.6148148775100708, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8397362232208252, + "num_tokens": 75681490.0, + "step": 2077 + }, + { + "epoch": 0.3858867223769731, + "grad_norm": 1.5461525917053223, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8520693778991699, + "num_tokens": 75719967.0, + "step": 2078 + }, + { + "epoch": 0.3860724233983287, + "grad_norm": 1.5640311241149902, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8705466985702515, + "num_tokens": 75751812.0, + "step": 2079 + }, + { + "epoch": 0.3862581244196843, + "grad_norm": 1.6293950080871582, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8529026508331299, + "num_tokens": 75790753.0, + "step": 2080 + }, + { + "epoch": 0.38644382544103995, + "grad_norm": 1.4716709852218628, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8675117492675781, + "num_tokens": 75829101.0, + "step": 2081 + }, + { + "epoch": 0.38662952646239557, + "grad_norm": 1.684978723526001, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8538814187049866, + "num_tokens": 75867208.0, + "step": 2082 + }, + { + "epoch": 0.38681522748375113, + "grad_norm": 1.5746948719024658, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8747112154960632, + "num_tokens": 75901032.0, + "step": 2083 + }, + { + "epoch": 0.38700092850510676, + "grad_norm": 1.4680476188659668, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8603557348251343, + "num_tokens": 75942650.0, + "step": 2084 + }, + { + "epoch": 0.3871866295264624, + "grad_norm": 1.5033738613128662, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8561418652534485, + "num_tokens": 75979878.0, + "step": 2085 + }, + { + "epoch": 0.387372330547818, + "grad_norm": 1.4992408752441406, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8595237135887146, + "num_tokens": 76018818.0, + "step": 2086 + }, + { + "epoch": 0.3875580315691736, + "grad_norm": 1.6164257526397705, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8657497763633728, + "num_tokens": 76051104.0, + "step": 2087 + }, + { + "epoch": 0.38774373259052924, + "grad_norm": 1.7045207023620605, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8734674453735352, + "num_tokens": 76080824.0, + "step": 2088 + }, + { + "epoch": 0.38792943361188487, + "grad_norm": 1.510633111000061, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8757399320602417, + "num_tokens": 76117978.0, + "step": 2089 + }, + { + "epoch": 0.3881151346332405, + "grad_norm": 1.6314910650253296, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8726325631141663, + "num_tokens": 76151756.0, + "step": 2090 + }, + { + "epoch": 0.3883008356545961, + "grad_norm": 1.5642669200897217, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8594719767570496, + "num_tokens": 76190629.0, + "step": 2091 + }, + { + "epoch": 0.38848653667595173, + "grad_norm": 1.540462851524353, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8707556128501892, + "num_tokens": 76225827.0, + "step": 2092 + }, + { + "epoch": 0.38867223769730735, + "grad_norm": 1.4726896286010742, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8614761829376221, + "num_tokens": 76267980.0, + "step": 2093 + }, + { + "epoch": 0.388857938718663, + "grad_norm": 1.5902915000915527, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8763649463653564, + "num_tokens": 76301092.0, + "step": 2094 + }, + { + "epoch": 0.38904363974001854, + "grad_norm": 1.438362717628479, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8638594150543213, + "num_tokens": 76338758.0, + "step": 2095 + }, + { + "epoch": 0.38922934076137417, + "grad_norm": 1.6365716457366943, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8613379001617432, + "num_tokens": 76372623.0, + "step": 2096 + }, + { + "epoch": 0.3894150417827298, + "grad_norm": 1.5452265739440918, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8738305568695068, + "num_tokens": 76409631.0, + "step": 2097 + }, + { + "epoch": 0.3896007428040854, + "grad_norm": 1.6343421936035156, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8549087643623352, + "num_tokens": 76446139.0, + "step": 2098 + }, + { + "epoch": 0.38978644382544103, + "grad_norm": 1.4879339933395386, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8724768161773682, + "num_tokens": 76482956.0, + "step": 2099 + }, + { + "epoch": 0.38997214484679665, + "grad_norm": 1.45903480052948, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8631148934364319, + "num_tokens": 76522028.0, + "step": 2100 + }, + { + "epoch": 0.3901578458681523, + "grad_norm": 1.5587893724441528, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8612239956855774, + "num_tokens": 76558359.0, + "step": 2101 + }, + { + "epoch": 0.3903435468895079, + "grad_norm": 1.590607762336731, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8494238257408142, + "num_tokens": 76596640.0, + "step": 2102 + }, + { + "epoch": 0.3905292479108635, + "grad_norm": 1.6123589277267456, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8656509518623352, + "num_tokens": 76629535.0, + "step": 2103 + }, + { + "epoch": 0.39071494893221914, + "grad_norm": 1.6228078603744507, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8475220203399658, + "num_tokens": 76663764.0, + "step": 2104 + }, + { + "epoch": 0.39090064995357476, + "grad_norm": 1.4748203754425049, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8696336150169373, + "num_tokens": 76701924.0, + "step": 2105 + }, + { + "epoch": 0.3910863509749304, + "grad_norm": 1.5799341201782227, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.859068751335144, + "num_tokens": 76737766.0, + "step": 2106 + }, + { + "epoch": 0.39127205199628595, + "grad_norm": 1.7099188566207886, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.855125904083252, + "num_tokens": 76767912.0, + "step": 2107 + }, + { + "epoch": 0.3914577530176416, + "grad_norm": 1.6983344554901123, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8537431955337524, + "num_tokens": 76802608.0, + "step": 2108 + }, + { + "epoch": 0.3916434540389972, + "grad_norm": 1.4757434129714966, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8676090240478516, + "num_tokens": 76845921.0, + "step": 2109 + }, + { + "epoch": 0.3918291550603528, + "grad_norm": 1.6293774843215942, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8599401712417603, + "num_tokens": 76877478.0, + "step": 2110 + }, + { + "epoch": 0.39201485608170844, + "grad_norm": 1.5950769186019897, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8603658080101013, + "num_tokens": 76912549.0, + "step": 2111 + }, + { + "epoch": 0.39220055710306406, + "grad_norm": 1.60074782371521, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8645040988922119, + "num_tokens": 76945206.0, + "step": 2112 + }, + { + "epoch": 0.3923862581244197, + "grad_norm": 1.5139408111572266, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8618617057800293, + "num_tokens": 76983026.0, + "step": 2113 + }, + { + "epoch": 0.3925719591457753, + "grad_norm": 1.4632225036621094, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8715965151786804, + "num_tokens": 77023969.0, + "step": 2114 + }, + { + "epoch": 0.39275766016713093, + "grad_norm": 1.660752773284912, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8803418874740601, + "num_tokens": 77052291.0, + "step": 2115 + }, + { + "epoch": 0.39294336118848655, + "grad_norm": 1.4940565824508667, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.86662358045578, + "num_tokens": 77089943.0, + "step": 2116 + }, + { + "epoch": 0.3931290622098422, + "grad_norm": 1.765507459640503, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8654723763465881, + "num_tokens": 77117304.0, + "step": 2117 + }, + { + "epoch": 0.3933147632311978, + "grad_norm": 1.564276933670044, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8553241491317749, + "num_tokens": 77154797.0, + "step": 2118 + }, + { + "epoch": 0.39350046425255336, + "grad_norm": 1.5105831623077393, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8531873822212219, + "num_tokens": 77194818.0, + "step": 2119 + }, + { + "epoch": 0.393686165273909, + "grad_norm": 1.6853761672973633, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8462037444114685, + "num_tokens": 77224913.0, + "step": 2120 + }, + { + "epoch": 0.3938718662952646, + "grad_norm": 1.5540846586227417, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8606420159339905, + "num_tokens": 77259454.0, + "step": 2121 + }, + { + "epoch": 0.3940575673166202, + "grad_norm": 1.5812221765518188, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8570114374160767, + "num_tokens": 77295361.0, + "step": 2122 + }, + { + "epoch": 0.39424326833797585, + "grad_norm": 1.6617186069488525, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8554447889328003, + "num_tokens": 77330648.0, + "step": 2123 + }, + { + "epoch": 0.39442896935933147, + "grad_norm": 1.5197566747665405, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8735577464103699, + "num_tokens": 77367909.0, + "step": 2124 + }, + { + "epoch": 0.3946146703806871, + "grad_norm": 1.476292610168457, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8701666593551636, + "num_tokens": 77405996.0, + "step": 2125 + }, + { + "epoch": 0.3948003714020427, + "grad_norm": 1.5013608932495117, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8683800101280212, + "num_tokens": 77441968.0, + "step": 2126 + }, + { + "epoch": 0.39498607242339834, + "grad_norm": 1.4051231145858765, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8631855249404907, + "num_tokens": 77485223.0, + "step": 2127 + }, + { + "epoch": 0.39517177344475396, + "grad_norm": 1.5015267133712769, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8618117570877075, + "num_tokens": 77525844.0, + "step": 2128 + }, + { + "epoch": 0.3953574744661096, + "grad_norm": 1.462470293045044, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8625925779342651, + "num_tokens": 77561948.0, + "step": 2129 + }, + { + "epoch": 0.3955431754874652, + "grad_norm": 1.525713562965393, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8425229787826538, + "num_tokens": 77603216.0, + "step": 2130 + }, + { + "epoch": 0.3957288765088208, + "grad_norm": 1.6048598289489746, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.860264241695404, + "num_tokens": 77636339.0, + "step": 2131 + }, + { + "epoch": 0.3959145775301764, + "grad_norm": 1.393142819404602, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8680121898651123, + "num_tokens": 77678295.0, + "step": 2132 + }, + { + "epoch": 0.396100278551532, + "grad_norm": 1.554248332977295, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8720760941505432, + "num_tokens": 77710964.0, + "step": 2133 + }, + { + "epoch": 0.39628597957288764, + "grad_norm": 1.6885472536087036, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8574077486991882, + "num_tokens": 77742293.0, + "step": 2134 + }, + { + "epoch": 0.39647168059424326, + "grad_norm": 1.5711923837661743, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8585034608840942, + "num_tokens": 77777594.0, + "step": 2135 + }, + { + "epoch": 0.3966573816155989, + "grad_norm": 1.5946931838989258, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8659435510635376, + "num_tokens": 77815677.0, + "step": 2136 + }, + { + "epoch": 0.3968430826369545, + "grad_norm": 1.5794001817703247, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8617311716079712, + "num_tokens": 77856924.0, + "step": 2137 + }, + { + "epoch": 0.3970287836583101, + "grad_norm": 1.6383649110794067, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8519350290298462, + "num_tokens": 77893155.0, + "step": 2138 + }, + { + "epoch": 0.39721448467966575, + "grad_norm": 1.7141157388687134, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8592725992202759, + "num_tokens": 77924963.0, + "step": 2139 + }, + { + "epoch": 0.39740018570102137, + "grad_norm": 1.678582787513733, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8528668880462646, + "num_tokens": 77961260.0, + "step": 2140 + }, + { + "epoch": 0.397585886722377, + "grad_norm": 1.5578898191452026, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8601189851760864, + "num_tokens": 77994934.0, + "step": 2141 + }, + { + "epoch": 0.3977715877437326, + "grad_norm": 1.500012993812561, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8524036407470703, + "num_tokens": 78037203.0, + "step": 2142 + }, + { + "epoch": 0.39795728876508824, + "grad_norm": 1.5462275743484497, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8739394545555115, + "num_tokens": 78073574.0, + "step": 2143 + }, + { + "epoch": 0.3981429897864438, + "grad_norm": 1.5264660120010376, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8586857318878174, + "num_tokens": 78111726.0, + "step": 2144 + }, + { + "epoch": 0.3983286908077994, + "grad_norm": 1.4771640300750732, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8719127178192139, + "num_tokens": 78150791.0, + "step": 2145 + }, + { + "epoch": 0.39851439182915505, + "grad_norm": 1.5465501546859741, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8743846416473389, + "num_tokens": 78185814.0, + "step": 2146 + }, + { + "epoch": 0.39870009285051067, + "grad_norm": 1.6591764688491821, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8563790321350098, + "num_tokens": 78218422.0, + "step": 2147 + }, + { + "epoch": 0.3988857938718663, + "grad_norm": 1.4938212633132935, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8660226464271545, + "num_tokens": 78259693.0, + "step": 2148 + }, + { + "epoch": 0.3990714948932219, + "grad_norm": 1.3952388763427734, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8647789359092712, + "num_tokens": 78300202.0, + "step": 2149 + }, + { + "epoch": 0.39925719591457753, + "grad_norm": 1.4797863960266113, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8501182794570923, + "num_tokens": 78341216.0, + "step": 2150 + }, + { + "epoch": 0.39944289693593316, + "grad_norm": 1.5696871280670166, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8643351793289185, + "num_tokens": 78377737.0, + "step": 2151 + }, + { + "epoch": 0.3996285979572888, + "grad_norm": 1.5139986276626587, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8506275415420532, + "num_tokens": 78418311.0, + "step": 2152 + }, + { + "epoch": 0.3998142989786444, + "grad_norm": 1.682673692703247, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8474627137184143, + "num_tokens": 78452535.0, + "step": 2153 + }, + { + "epoch": 0.4, + "grad_norm": 1.4874026775360107, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8559085130691528, + "num_tokens": 78494863.0, + "step": 2154 + }, + { + "epoch": 0.40018570102135564, + "grad_norm": 1.6055247783660889, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8510020971298218, + "num_tokens": 78530539.0, + "step": 2155 + }, + { + "epoch": 0.4003714020427112, + "grad_norm": 1.6967966556549072, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8569766879081726, + "num_tokens": 78562579.0, + "step": 2156 + }, + { + "epoch": 0.40055710306406683, + "grad_norm": 1.402444839477539, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.858401894569397, + "num_tokens": 78603519.0, + "step": 2157 + }, + { + "epoch": 0.40074280408542245, + "grad_norm": 1.4731402397155762, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8723198175430298, + "num_tokens": 78640770.0, + "step": 2158 + }, + { + "epoch": 0.4009285051067781, + "grad_norm": 1.4996757507324219, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8580049276351929, + "num_tokens": 78677505.0, + "step": 2159 + }, + { + "epoch": 0.4011142061281337, + "grad_norm": 1.5512561798095703, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8626535534858704, + "num_tokens": 78714706.0, + "step": 2160 + }, + { + "epoch": 0.4012999071494893, + "grad_norm": 1.5453356504440308, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8598837852478027, + "num_tokens": 78748245.0, + "step": 2161 + }, + { + "epoch": 0.40148560817084494, + "grad_norm": 1.477583885192871, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8704279661178589, + "num_tokens": 78785444.0, + "step": 2162 + }, + { + "epoch": 0.40167130919220057, + "grad_norm": 1.6235427856445312, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8589389324188232, + "num_tokens": 78818158.0, + "step": 2163 + }, + { + "epoch": 0.4018570102135562, + "grad_norm": 1.4918971061706543, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8568853735923767, + "num_tokens": 78856309.0, + "step": 2164 + }, + { + "epoch": 0.4020427112349118, + "grad_norm": 1.5754306316375732, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8608396649360657, + "num_tokens": 78899183.0, + "step": 2165 + }, + { + "epoch": 0.40222841225626743, + "grad_norm": 1.5705934762954712, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8659520745277405, + "num_tokens": 78932456.0, + "step": 2166 + }, + { + "epoch": 0.40241411327762305, + "grad_norm": 1.5649971961975098, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8542051315307617, + "num_tokens": 78970986.0, + "step": 2167 + }, + { + "epoch": 0.4025998142989786, + "grad_norm": 1.3409600257873535, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8580650687217712, + "num_tokens": 79016748.0, + "step": 2168 + }, + { + "epoch": 0.40278551532033424, + "grad_norm": 1.541305661201477, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8761751055717468, + "num_tokens": 79054148.0, + "step": 2169 + }, + { + "epoch": 0.40297121634168986, + "grad_norm": 1.5495853424072266, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8676139116287231, + "num_tokens": 79090961.0, + "step": 2170 + }, + { + "epoch": 0.4031569173630455, + "grad_norm": 1.5015407800674438, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8614842891693115, + "num_tokens": 79127706.0, + "step": 2171 + }, + { + "epoch": 0.4033426183844011, + "grad_norm": 1.603220820426941, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8722278475761414, + "num_tokens": 79161312.0, + "step": 2172 + }, + { + "epoch": 0.40352831940575673, + "grad_norm": 1.7621448040008545, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8623300790786743, + "num_tokens": 79190747.0, + "step": 2173 + }, + { + "epoch": 0.40371402042711235, + "grad_norm": 1.5062514543533325, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8725651502609253, + "num_tokens": 79227236.0, + "step": 2174 + }, + { + "epoch": 0.403899721448468, + "grad_norm": 1.5788700580596924, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8586344718933105, + "num_tokens": 79263888.0, + "step": 2175 + }, + { + "epoch": 0.4040854224698236, + "grad_norm": 1.6879178285598755, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8458695411682129, + "num_tokens": 79298051.0, + "step": 2176 + }, + { + "epoch": 0.4042711234911792, + "grad_norm": 1.5183597803115845, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8592442274093628, + "num_tokens": 79335883.0, + "step": 2177 + }, + { + "epoch": 0.40445682451253484, + "grad_norm": 1.6913466453552246, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8563694953918457, + "num_tokens": 79369397.0, + "step": 2178 + }, + { + "epoch": 0.40464252553389046, + "grad_norm": 1.7459220886230469, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8684104681015015, + "num_tokens": 79401720.0, + "step": 2179 + }, + { + "epoch": 0.40482822655524603, + "grad_norm": 1.4759916067123413, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.859919548034668, + "num_tokens": 79446291.0, + "step": 2180 + }, + { + "epoch": 0.40501392757660165, + "grad_norm": 1.531184434890747, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8481783866882324, + "num_tokens": 79488086.0, + "step": 2181 + }, + { + "epoch": 0.4051996285979573, + "grad_norm": 1.9154269695281982, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8488850593566895, + "num_tokens": 79517400.0, + "step": 2182 + }, + { + "epoch": 0.4053853296193129, + "grad_norm": 1.6029220819473267, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8650156259536743, + "num_tokens": 79555171.0, + "step": 2183 + }, + { + "epoch": 0.4055710306406685, + "grad_norm": 1.6810901165008545, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8464639782905579, + "num_tokens": 79592259.0, + "step": 2184 + }, + { + "epoch": 0.40575673166202414, + "grad_norm": 1.444881796836853, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.870025634765625, + "num_tokens": 79633796.0, + "step": 2185 + }, + { + "epoch": 0.40594243268337976, + "grad_norm": 1.6037856340408325, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8654477000236511, + "num_tokens": 79668787.0, + "step": 2186 + }, + { + "epoch": 0.4061281337047354, + "grad_norm": 1.5466227531433105, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8669329881668091, + "num_tokens": 79708914.0, + "step": 2187 + }, + { + "epoch": 0.406313834726091, + "grad_norm": 1.6819618940353394, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8721923232078552, + "num_tokens": 79741611.0, + "step": 2188 + }, + { + "epoch": 0.4064995357474466, + "grad_norm": 1.3771922588348389, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8679466247558594, + "num_tokens": 79785273.0, + "step": 2189 + }, + { + "epoch": 0.40668523676880225, + "grad_norm": 1.5517137050628662, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8565380573272705, + "num_tokens": 79822519.0, + "step": 2190 + }, + { + "epoch": 0.40687093779015787, + "grad_norm": 1.5379676818847656, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8724422454833984, + "num_tokens": 79861867.0, + "step": 2191 + }, + { + "epoch": 0.40705663881151344, + "grad_norm": 1.608625054359436, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8515111207962036, + "num_tokens": 79897606.0, + "step": 2192 + }, + { + "epoch": 0.40724233983286906, + "grad_norm": 1.6689667701721191, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8522184491157532, + "num_tokens": 79931888.0, + "step": 2193 + }, + { + "epoch": 0.4074280408542247, + "grad_norm": 1.6283196210861206, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8539901971817017, + "num_tokens": 79967161.0, + "step": 2194 + }, + { + "epoch": 0.4076137418755803, + "grad_norm": 1.6699848175048828, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8500942587852478, + "num_tokens": 80003842.0, + "step": 2195 + }, + { + "epoch": 0.4077994428969359, + "grad_norm": 1.488645315170288, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8695875406265259, + "num_tokens": 80040981.0, + "step": 2196 + }, + { + "epoch": 0.40798514391829155, + "grad_norm": 1.5406724214553833, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8608821034431458, + "num_tokens": 80076174.0, + "step": 2197 + }, + { + "epoch": 0.40817084493964717, + "grad_norm": 1.3935290575027466, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8671026229858398, + "num_tokens": 80119286.0, + "step": 2198 + }, + { + "epoch": 0.4083565459610028, + "grad_norm": 1.616757869720459, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8655588626861572, + "num_tokens": 80153643.0, + "step": 2199 + }, + { + "epoch": 0.4085422469823584, + "grad_norm": 1.4293700456619263, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.879533052444458, + "num_tokens": 80195046.0, + "step": 2200 + }, + { + "epoch": 0.40872794800371404, + "grad_norm": 1.4594758749008179, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8678586483001709, + "num_tokens": 80232759.0, + "step": 2201 + }, + { + "epoch": 0.40891364902506966, + "grad_norm": 1.548149824142456, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8642853498458862, + "num_tokens": 80268634.0, + "step": 2202 + }, + { + "epoch": 0.4090993500464253, + "grad_norm": 1.580741047859192, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8570420742034912, + "num_tokens": 80307774.0, + "step": 2203 + }, + { + "epoch": 0.40928505106778085, + "grad_norm": 1.601789951324463, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8583234548568726, + "num_tokens": 80343238.0, + "step": 2204 + }, + { + "epoch": 0.40947075208913647, + "grad_norm": 1.5896354913711548, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8762295842170715, + "num_tokens": 80375377.0, + "step": 2205 + }, + { + "epoch": 0.4096564531104921, + "grad_norm": 1.6208277940750122, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8552073836326599, + "num_tokens": 80409972.0, + "step": 2206 + }, + { + "epoch": 0.4098421541318477, + "grad_norm": 1.6563055515289307, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8674571514129639, + "num_tokens": 80445587.0, + "step": 2207 + }, + { + "epoch": 0.41002785515320334, + "grad_norm": 1.5857605934143066, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8583603501319885, + "num_tokens": 80479405.0, + "step": 2208 + }, + { + "epoch": 0.41021355617455896, + "grad_norm": 1.6773388385772705, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8572232127189636, + "num_tokens": 80513047.0, + "step": 2209 + }, + { + "epoch": 0.4103992571959146, + "grad_norm": 1.4288078546524048, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8755829930305481, + "num_tokens": 80551067.0, + "step": 2210 + }, + { + "epoch": 0.4105849582172702, + "grad_norm": 1.4699472188949585, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8655704259872437, + "num_tokens": 80591949.0, + "step": 2211 + }, + { + "epoch": 0.4107706592386258, + "grad_norm": 1.5311607122421265, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8515410423278809, + "num_tokens": 80629200.0, + "step": 2212 + }, + { + "epoch": 0.41095636025998145, + "grad_norm": 1.3952457904815674, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8740167021751404, + "num_tokens": 80672823.0, + "step": 2213 + }, + { + "epoch": 0.41114206128133707, + "grad_norm": 1.4810696840286255, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8661389350891113, + "num_tokens": 80711051.0, + "step": 2214 + }, + { + "epoch": 0.4113277623026927, + "grad_norm": 1.4231916666030884, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8560395240783691, + "num_tokens": 80751829.0, + "step": 2215 + }, + { + "epoch": 0.41151346332404826, + "grad_norm": 1.590462565422058, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8543776869773865, + "num_tokens": 80789551.0, + "step": 2216 + }, + { + "epoch": 0.4116991643454039, + "grad_norm": 1.4485849142074585, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8687611818313599, + "num_tokens": 80826816.0, + "step": 2217 + }, + { + "epoch": 0.4118848653667595, + "grad_norm": 1.4317426681518555, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8713301420211792, + "num_tokens": 80866407.0, + "step": 2218 + }, + { + "epoch": 0.4120705663881151, + "grad_norm": 1.7997255325317383, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8530135154724121, + "num_tokens": 80896464.0, + "step": 2219 + }, + { + "epoch": 0.41225626740947074, + "grad_norm": 1.5984394550323486, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8668295741081238, + "num_tokens": 80931884.0, + "step": 2220 + }, + { + "epoch": 0.41244196843082637, + "grad_norm": 1.456067681312561, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.855623722076416, + "num_tokens": 80972943.0, + "step": 2221 + }, + { + "epoch": 0.412627669452182, + "grad_norm": 1.5860322713851929, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8631840944290161, + "num_tokens": 81006527.0, + "step": 2222 + }, + { + "epoch": 0.4128133704735376, + "grad_norm": 1.7070949077606201, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8584384918212891, + "num_tokens": 81040241.0, + "step": 2223 + }, + { + "epoch": 0.41299907149489323, + "grad_norm": 1.3926810026168823, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8730964660644531, + "num_tokens": 81084871.0, + "step": 2224 + }, + { + "epoch": 0.41318477251624885, + "grad_norm": 1.6560413837432861, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8537886142730713, + "num_tokens": 81120161.0, + "step": 2225 + }, + { + "epoch": 0.4133704735376045, + "grad_norm": 1.7879283428192139, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.837639331817627, + "num_tokens": 81152897.0, + "step": 2226 + }, + { + "epoch": 0.4135561745589601, + "grad_norm": 1.5049275159835815, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8770486116409302, + "num_tokens": 81189976.0, + "step": 2227 + }, + { + "epoch": 0.41374187558031567, + "grad_norm": 1.6348682641983032, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8456566333770752, + "num_tokens": 81225707.0, + "step": 2228 + }, + { + "epoch": 0.4139275766016713, + "grad_norm": 1.5363231897354126, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8602802157402039, + "num_tokens": 81261026.0, + "step": 2229 + }, + { + "epoch": 0.4141132776230269, + "grad_norm": 1.6065914630889893, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8481050133705139, + "num_tokens": 81295095.0, + "step": 2230 + }, + { + "epoch": 0.41429897864438253, + "grad_norm": 1.5409188270568848, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8495914936065674, + "num_tokens": 81338643.0, + "step": 2231 + }, + { + "epoch": 0.41448467966573815, + "grad_norm": 1.7419229745864868, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8526694178581238, + "num_tokens": 81369426.0, + "step": 2232 + }, + { + "epoch": 0.4146703806870938, + "grad_norm": 1.6601709127426147, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8631064295768738, + "num_tokens": 81407488.0, + "step": 2233 + }, + { + "epoch": 0.4148560817084494, + "grad_norm": 1.5260732173919678, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8568745851516724, + "num_tokens": 81445564.0, + "step": 2234 + }, + { + "epoch": 0.415041782729805, + "grad_norm": 1.7240111827850342, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.867717444896698, + "num_tokens": 81475163.0, + "step": 2235 + }, + { + "epoch": 0.41522748375116064, + "grad_norm": 1.6574629545211792, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.869137167930603, + "num_tokens": 81507916.0, + "step": 2236 + }, + { + "epoch": 0.41541318477251626, + "grad_norm": 1.8840281963348389, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8617510795593262, + "num_tokens": 81538802.0, + "step": 2237 + }, + { + "epoch": 0.4155988857938719, + "grad_norm": 1.5666359663009644, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8502496480941772, + "num_tokens": 81577307.0, + "step": 2238 + }, + { + "epoch": 0.4157845868152275, + "grad_norm": 1.7353070974349976, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8732941150665283, + "num_tokens": 81609498.0, + "step": 2239 + }, + { + "epoch": 0.4159702878365831, + "grad_norm": 1.615096926689148, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8515642285346985, + "num_tokens": 81646222.0, + "step": 2240 + }, + { + "epoch": 0.4161559888579387, + "grad_norm": 1.4807465076446533, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8684728145599365, + "num_tokens": 81683880.0, + "step": 2241 + }, + { + "epoch": 0.4163416898792943, + "grad_norm": 1.6588001251220703, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8626396059989929, + "num_tokens": 81719286.0, + "step": 2242 + }, + { + "epoch": 0.41652739090064994, + "grad_norm": 1.620285987854004, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8563733100891113, + "num_tokens": 81753917.0, + "step": 2243 + }, + { + "epoch": 0.41671309192200556, + "grad_norm": 1.5662847757339478, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8686867356300354, + "num_tokens": 81789482.0, + "step": 2244 + }, + { + "epoch": 0.4168987929433612, + "grad_norm": 1.573455572128296, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8543908596038818, + "num_tokens": 81826305.0, + "step": 2245 + }, + { + "epoch": 0.4170844939647168, + "grad_norm": 1.6387380361557007, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8596856594085693, + "num_tokens": 81858635.0, + "step": 2246 + }, + { + "epoch": 0.41727019498607243, + "grad_norm": 1.636579155921936, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8676958084106445, + "num_tokens": 81891240.0, + "step": 2247 + }, + { + "epoch": 0.41745589600742805, + "grad_norm": 1.4798551797866821, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8650528192520142, + "num_tokens": 81929855.0, + "step": 2248 + }, + { + "epoch": 0.4176415970287837, + "grad_norm": 1.5844593048095703, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8657652139663696, + "num_tokens": 81961943.0, + "step": 2249 + }, + { + "epoch": 0.4178272980501393, + "grad_norm": 1.5459848642349243, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8665017485618591, + "num_tokens": 81998353.0, + "step": 2250 + }, + { + "epoch": 0.4180129990714949, + "grad_norm": 1.594954490661621, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8631393313407898, + "num_tokens": 82035716.0, + "step": 2251 + }, + { + "epoch": 0.4181987000928505, + "grad_norm": 1.6946039199829102, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8490841388702393, + "num_tokens": 82072947.0, + "step": 2252 + }, + { + "epoch": 0.4183844011142061, + "grad_norm": 1.4906734228134155, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8625375032424927, + "num_tokens": 82112480.0, + "step": 2253 + }, + { + "epoch": 0.41857010213556173, + "grad_norm": 1.5435630083084106, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8534538745880127, + "num_tokens": 82148898.0, + "step": 2254 + }, + { + "epoch": 0.41875580315691735, + "grad_norm": 1.5805425643920898, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8767073154449463, + "num_tokens": 82181268.0, + "step": 2255 + }, + { + "epoch": 0.41894150417827297, + "grad_norm": 1.596311330795288, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8697183132171631, + "num_tokens": 82216743.0, + "step": 2256 + }, + { + "epoch": 0.4191272051996286, + "grad_norm": 1.8757023811340332, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8555082082748413, + "num_tokens": 82244901.0, + "step": 2257 + }, + { + "epoch": 0.4193129062209842, + "grad_norm": 1.6237624883651733, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.875980794429779, + "num_tokens": 82278287.0, + "step": 2258 + }, + { + "epoch": 0.41949860724233984, + "grad_norm": 1.5119271278381348, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8576265573501587, + "num_tokens": 82318830.0, + "step": 2259 + }, + { + "epoch": 0.41968430826369546, + "grad_norm": 1.466933250427246, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8613604307174683, + "num_tokens": 82357727.0, + "step": 2260 + }, + { + "epoch": 0.4198700092850511, + "grad_norm": 1.63230299949646, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.867504894733429, + "num_tokens": 82391712.0, + "step": 2261 + }, + { + "epoch": 0.4200557103064067, + "grad_norm": 1.7708038091659546, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8645693063735962, + "num_tokens": 82424680.0, + "step": 2262 + }, + { + "epoch": 0.4202414113277623, + "grad_norm": 1.6339610815048218, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8500587940216064, + "num_tokens": 82457778.0, + "step": 2263 + }, + { + "epoch": 0.4204271123491179, + "grad_norm": 1.5283896923065186, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8674308061599731, + "num_tokens": 82495120.0, + "step": 2264 + }, + { + "epoch": 0.4206128133704735, + "grad_norm": 1.5902564525604248, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.856529712677002, + "num_tokens": 82531869.0, + "step": 2265 + }, + { + "epoch": 0.42079851439182914, + "grad_norm": 1.4461727142333984, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8729022741317749, + "num_tokens": 82569376.0, + "step": 2266 + }, + { + "epoch": 0.42098421541318476, + "grad_norm": 1.620163083076477, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8728333711624146, + "num_tokens": 82601150.0, + "step": 2267 + }, + { + "epoch": 0.4211699164345404, + "grad_norm": 1.6019706726074219, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8634229898452759, + "num_tokens": 82633960.0, + "step": 2268 + }, + { + "epoch": 0.421355617455896, + "grad_norm": 1.6671384572982788, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8475171327590942, + "num_tokens": 82663467.0, + "step": 2269 + }, + { + "epoch": 0.4215413184772516, + "grad_norm": 1.7063013315200806, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8509583473205566, + "num_tokens": 82696000.0, + "step": 2270 + }, + { + "epoch": 0.42172701949860725, + "grad_norm": 1.5395835638046265, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8646456003189087, + "num_tokens": 82730890.0, + "step": 2271 + }, + { + "epoch": 0.42191272051996287, + "grad_norm": 1.4155422449111938, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8658815622329712, + "num_tokens": 82773617.0, + "step": 2272 + }, + { + "epoch": 0.4220984215413185, + "grad_norm": 1.564563274383545, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8644291162490845, + "num_tokens": 82808217.0, + "step": 2273 + }, + { + "epoch": 0.4222841225626741, + "grad_norm": 1.6313940286636353, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.861163318157196, + "num_tokens": 82844176.0, + "step": 2274 + }, + { + "epoch": 0.42246982358402974, + "grad_norm": 1.4281975030899048, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.869805097579956, + "num_tokens": 82887454.0, + "step": 2275 + }, + { + "epoch": 0.4226555246053853, + "grad_norm": 1.6591018438339233, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8528494834899902, + "num_tokens": 82921925.0, + "step": 2276 + }, + { + "epoch": 0.4228412256267409, + "grad_norm": 1.5937087535858154, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8509423732757568, + "num_tokens": 82957670.0, + "step": 2277 + }, + { + "epoch": 0.42302692664809655, + "grad_norm": 1.68441903591156, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8489738702774048, + "num_tokens": 82993027.0, + "step": 2278 + }, + { + "epoch": 0.42321262766945217, + "grad_norm": 1.440710186958313, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8698835372924805, + "num_tokens": 83031781.0, + "step": 2279 + }, + { + "epoch": 0.4233983286908078, + "grad_norm": 1.5241490602493286, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8651667833328247, + "num_tokens": 83068604.0, + "step": 2280 + }, + { + "epoch": 0.4235840297121634, + "grad_norm": 1.590360164642334, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8501961827278137, + "num_tokens": 83103853.0, + "step": 2281 + }, + { + "epoch": 0.42376973073351903, + "grad_norm": 1.5902111530303955, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8422383069992065, + "num_tokens": 83140917.0, + "step": 2282 + }, + { + "epoch": 0.42395543175487466, + "grad_norm": 1.5050674676895142, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8575906157493591, + "num_tokens": 83176410.0, + "step": 2283 + }, + { + "epoch": 0.4241411327762303, + "grad_norm": 1.5555611848831177, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8697945475578308, + "num_tokens": 83210612.0, + "step": 2284 + }, + { + "epoch": 0.4243268337975859, + "grad_norm": 1.6037195920944214, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8602460622787476, + "num_tokens": 83247358.0, + "step": 2285 + }, + { + "epoch": 0.4245125348189415, + "grad_norm": 1.4739688634872437, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8693487644195557, + "num_tokens": 83286716.0, + "step": 2286 + }, + { + "epoch": 0.42469823584029714, + "grad_norm": 1.5871943235397339, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8603805303573608, + "num_tokens": 83322893.0, + "step": 2287 + }, + { + "epoch": 0.42488393686165277, + "grad_norm": 1.382516860961914, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8725132346153259, + "num_tokens": 83360436.0, + "step": 2288 + }, + { + "epoch": 0.42506963788300833, + "grad_norm": 1.5094250440597534, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8709606528282166, + "num_tokens": 83396100.0, + "step": 2289 + }, + { + "epoch": 0.42525533890436396, + "grad_norm": 1.49933922290802, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8652180433273315, + "num_tokens": 83434679.0, + "step": 2290 + }, + { + "epoch": 0.4254410399257196, + "grad_norm": 1.415177583694458, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8585096001625061, + "num_tokens": 83476789.0, + "step": 2291 + }, + { + "epoch": 0.4256267409470752, + "grad_norm": 1.618212342262268, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8629293441772461, + "num_tokens": 83509846.0, + "step": 2292 + }, + { + "epoch": 0.4258124419684308, + "grad_norm": 1.6416999101638794, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8757011890411377, + "num_tokens": 83541258.0, + "step": 2293 + }, + { + "epoch": 0.42599814298978644, + "grad_norm": 1.6909329891204834, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8594231605529785, + "num_tokens": 83571410.0, + "step": 2294 + }, + { + "epoch": 0.42618384401114207, + "grad_norm": 1.7957992553710938, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8531836271286011, + "num_tokens": 83604141.0, + "step": 2295 + }, + { + "epoch": 0.4263695450324977, + "grad_norm": 1.5127307176589966, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8745315670967102, + "num_tokens": 83642043.0, + "step": 2296 + }, + { + "epoch": 0.4265552460538533, + "grad_norm": 1.701076626777649, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8647662997245789, + "num_tokens": 83674536.0, + "step": 2297 + }, + { + "epoch": 0.42674094707520893, + "grad_norm": 1.5053714513778687, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8647233843803406, + "num_tokens": 83710234.0, + "step": 2298 + }, + { + "epoch": 0.42692664809656455, + "grad_norm": 1.8186085224151611, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.867171049118042, + "num_tokens": 83738441.0, + "step": 2299 + }, + { + "epoch": 0.4271123491179202, + "grad_norm": 1.5807279348373413, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8792181611061096, + "num_tokens": 83775473.0, + "step": 2300 + }, + { + "epoch": 0.42729805013927574, + "grad_norm": 1.6295591592788696, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.864111065864563, + "num_tokens": 83809012.0, + "step": 2301 + }, + { + "epoch": 0.42748375116063136, + "grad_norm": 1.569127082824707, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8488407135009766, + "num_tokens": 83846584.0, + "step": 2302 + }, + { + "epoch": 0.427669452181987, + "grad_norm": 1.5939300060272217, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8470572233200073, + "num_tokens": 83881749.0, + "step": 2303 + }, + { + "epoch": 0.4278551532033426, + "grad_norm": 1.8245395421981812, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8701677322387695, + "num_tokens": 83911364.0, + "step": 2304 + }, + { + "epoch": 0.42804085422469823, + "grad_norm": 1.6775137186050415, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8356692790985107, + "num_tokens": 83945017.0, + "step": 2305 + }, + { + "epoch": 0.42822655524605385, + "grad_norm": 1.5092891454696655, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8670701384544373, + "num_tokens": 83984006.0, + "step": 2306 + }, + { + "epoch": 0.4284122562674095, + "grad_norm": 1.5750876665115356, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8357741832733154, + "num_tokens": 84023984.0, + "step": 2307 + }, + { + "epoch": 0.4285979572887651, + "grad_norm": 1.477388858795166, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8577358722686768, + "num_tokens": 84065158.0, + "step": 2308 + }, + { + "epoch": 0.4287836583101207, + "grad_norm": 1.6835942268371582, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8609405755996704, + "num_tokens": 84097722.0, + "step": 2309 + }, + { + "epoch": 0.42896935933147634, + "grad_norm": 1.4580024480819702, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8665608167648315, + "num_tokens": 84135125.0, + "step": 2310 + }, + { + "epoch": 0.42915506035283196, + "grad_norm": 1.5402281284332275, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8712745904922485, + "num_tokens": 84172607.0, + "step": 2311 + }, + { + "epoch": 0.4293407613741876, + "grad_norm": 1.6294686794281006, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8585946559906006, + "num_tokens": 84209674.0, + "step": 2312 + }, + { + "epoch": 0.42952646239554315, + "grad_norm": 1.489168643951416, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8668532371520996, + "num_tokens": 84245494.0, + "step": 2313 + }, + { + "epoch": 0.4297121634168988, + "grad_norm": 1.641421914100647, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8587250709533691, + "num_tokens": 84280355.0, + "step": 2314 + }, + { + "epoch": 0.4298978644382544, + "grad_norm": 1.5993824005126953, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8543604612350464, + "num_tokens": 84320452.0, + "step": 2315 + }, + { + "epoch": 0.43008356545961, + "grad_norm": 1.571726679801941, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.861693263053894, + "num_tokens": 84357534.0, + "step": 2316 + }, + { + "epoch": 0.43026926648096564, + "grad_norm": 1.4871162176132202, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8709137439727783, + "num_tokens": 84395453.0, + "step": 2317 + }, + { + "epoch": 0.43045496750232126, + "grad_norm": 1.613318920135498, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.844245433807373, + "num_tokens": 84433553.0, + "step": 2318 + }, + { + "epoch": 0.4306406685236769, + "grad_norm": 1.5949034690856934, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8557240962982178, + "num_tokens": 84465278.0, + "step": 2319 + }, + { + "epoch": 0.4308263695450325, + "grad_norm": 1.505773663520813, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8666181564331055, + "num_tokens": 84502681.0, + "step": 2320 + }, + { + "epoch": 0.43101207056638813, + "grad_norm": 1.6635417938232422, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8691294193267822, + "num_tokens": 84533965.0, + "step": 2321 + }, + { + "epoch": 0.43119777158774375, + "grad_norm": 1.5364601612091064, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8471032977104187, + "num_tokens": 84570554.0, + "step": 2322 + }, + { + "epoch": 0.43138347260909937, + "grad_norm": 1.6038789749145508, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8741505146026611, + "num_tokens": 84606110.0, + "step": 2323 + }, + { + "epoch": 0.431569173630455, + "grad_norm": 1.3938137292861938, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8702752590179443, + "num_tokens": 84647662.0, + "step": 2324 + }, + { + "epoch": 0.43175487465181056, + "grad_norm": 1.6352148056030273, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8466811180114746, + "num_tokens": 84680446.0, + "step": 2325 + }, + { + "epoch": 0.4319405756731662, + "grad_norm": 1.4816734790802002, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8548471927642822, + "num_tokens": 84721063.0, + "step": 2326 + }, + { + "epoch": 0.4321262766945218, + "grad_norm": 1.55105721950531, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8617140650749207, + "num_tokens": 84757358.0, + "step": 2327 + }, + { + "epoch": 0.4323119777158774, + "grad_norm": 1.603448510169983, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.866193950176239, + "num_tokens": 84790258.0, + "step": 2328 + }, + { + "epoch": 0.43249767873723305, + "grad_norm": 1.3972009420394897, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.854322612285614, + "num_tokens": 84835663.0, + "step": 2329 + }, + { + "epoch": 0.43268337975858867, + "grad_norm": 1.665804386138916, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8632634282112122, + "num_tokens": 84866999.0, + "step": 2330 + }, + { + "epoch": 0.4328690807799443, + "grad_norm": 1.49362051486969, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8524905443191528, + "num_tokens": 84906018.0, + "step": 2331 + }, + { + "epoch": 0.4330547818012999, + "grad_norm": 1.3513410091400146, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8779913187026978, + "num_tokens": 84947414.0, + "step": 2332 + }, + { + "epoch": 0.43324048282265554, + "grad_norm": 1.4970920085906982, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8654659986495972, + "num_tokens": 84985254.0, + "step": 2333 + }, + { + "epoch": 0.43342618384401116, + "grad_norm": 1.5525802373886108, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8528021574020386, + "num_tokens": 85021225.0, + "step": 2334 + }, + { + "epoch": 0.4336118848653668, + "grad_norm": 1.6917519569396973, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8634958863258362, + "num_tokens": 85053643.0, + "step": 2335 + }, + { + "epoch": 0.4337975858867224, + "grad_norm": 1.620306372642517, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8640002012252808, + "num_tokens": 85084716.0, + "step": 2336 + }, + { + "epoch": 0.43398328690807797, + "grad_norm": 1.5823689699172974, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8654273748397827, + "num_tokens": 85121703.0, + "step": 2337 + }, + { + "epoch": 0.4341689879294336, + "grad_norm": 1.4869390726089478, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8775213956832886, + "num_tokens": 85159403.0, + "step": 2338 + }, + { + "epoch": 0.4343546889507892, + "grad_norm": 1.4174683094024658, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8721193671226501, + "num_tokens": 85199398.0, + "step": 2339 + }, + { + "epoch": 0.43454038997214484, + "grad_norm": 1.7031830549240112, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8333107233047485, + "num_tokens": 85233254.0, + "step": 2340 + }, + { + "epoch": 0.43472609099350046, + "grad_norm": 1.3832463026046753, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8643491268157959, + "num_tokens": 85276948.0, + "step": 2341 + }, + { + "epoch": 0.4349117920148561, + "grad_norm": 1.555324673652649, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8518953323364258, + "num_tokens": 85315321.0, + "step": 2342 + }, + { + "epoch": 0.4350974930362117, + "grad_norm": 1.5243955850601196, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8576712608337402, + "num_tokens": 85352324.0, + "step": 2343 + }, + { + "epoch": 0.4352831940575673, + "grad_norm": 1.86097252368927, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8435202240943909, + "num_tokens": 85380493.0, + "step": 2344 + }, + { + "epoch": 0.43546889507892295, + "grad_norm": 1.5182569026947021, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8594777584075928, + "num_tokens": 85419277.0, + "step": 2345 + }, + { + "epoch": 0.43565459610027857, + "grad_norm": 1.6217926740646362, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8778883218765259, + "num_tokens": 85451313.0, + "step": 2346 + }, + { + "epoch": 0.4358402971216342, + "grad_norm": 1.676342248916626, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8722585439682007, + "num_tokens": 85484728.0, + "step": 2347 + }, + { + "epoch": 0.4360259981429898, + "grad_norm": 1.4148889780044556, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8707573413848877, + "num_tokens": 85526824.0, + "step": 2348 + }, + { + "epoch": 0.4362116991643454, + "grad_norm": 1.644508719444275, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8693531155586243, + "num_tokens": 85559856.0, + "step": 2349 + }, + { + "epoch": 0.436397400185701, + "grad_norm": 1.7412750720977783, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8512993454933167, + "num_tokens": 85595469.0, + "step": 2350 + }, + { + "epoch": 0.4365831012070566, + "grad_norm": 1.5335923433303833, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8632089495658875, + "num_tokens": 85632628.0, + "step": 2351 + }, + { + "epoch": 0.43676880222841225, + "grad_norm": 1.7336359024047852, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8689530491828918, + "num_tokens": 85662998.0, + "step": 2352 + }, + { + "epoch": 0.43695450324976787, + "grad_norm": 1.6256940364837646, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8559242486953735, + "num_tokens": 85699421.0, + "step": 2353 + }, + { + "epoch": 0.4371402042711235, + "grad_norm": 1.5865020751953125, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8528822064399719, + "num_tokens": 85735840.0, + "step": 2354 + }, + { + "epoch": 0.4373259052924791, + "grad_norm": 1.5955132246017456, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.866531252861023, + "num_tokens": 85771848.0, + "step": 2355 + }, + { + "epoch": 0.43751160631383473, + "grad_norm": 1.5517808198928833, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8692068457603455, + "num_tokens": 85810737.0, + "step": 2356 + }, + { + "epoch": 0.43769730733519036, + "grad_norm": 1.4975812435150146, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.870405912399292, + "num_tokens": 85846164.0, + "step": 2357 + }, + { + "epoch": 0.437883008356546, + "grad_norm": 1.4540857076644897, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8697662353515625, + "num_tokens": 85886582.0, + "step": 2358 + }, + { + "epoch": 0.4380687093779016, + "grad_norm": 1.538757562637329, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8530256152153015, + "num_tokens": 85925398.0, + "step": 2359 + }, + { + "epoch": 0.4382544103992572, + "grad_norm": 1.5294280052185059, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8469113111495972, + "num_tokens": 85966385.0, + "step": 2360 + }, + { + "epoch": 0.4384401114206128, + "grad_norm": 1.4443734884262085, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8751398921012878, + "num_tokens": 86002377.0, + "step": 2361 + }, + { + "epoch": 0.4386258124419684, + "grad_norm": 1.618979573249817, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8659478425979614, + "num_tokens": 86036168.0, + "step": 2362 + }, + { + "epoch": 0.43881151346332403, + "grad_norm": 1.524111270904541, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.866141676902771, + "num_tokens": 86069926.0, + "step": 2363 + }, + { + "epoch": 0.43899721448467965, + "grad_norm": 1.6723012924194336, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.856911301612854, + "num_tokens": 86101681.0, + "step": 2364 + }, + { + "epoch": 0.4391829155060353, + "grad_norm": 1.6709086894989014, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8681044578552246, + "num_tokens": 86136136.0, + "step": 2365 + }, + { + "epoch": 0.4393686165273909, + "grad_norm": 1.6470106840133667, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.857628345489502, + "num_tokens": 86167994.0, + "step": 2366 + }, + { + "epoch": 0.4395543175487465, + "grad_norm": 1.6597113609313965, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8643330335617065, + "num_tokens": 86200943.0, + "step": 2367 + }, + { + "epoch": 0.43974001857010214, + "grad_norm": 1.5606118440628052, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8568173050880432, + "num_tokens": 86239874.0, + "step": 2368 + }, + { + "epoch": 0.43992571959145776, + "grad_norm": 1.576084017753601, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8587081432342529, + "num_tokens": 86278203.0, + "step": 2369 + }, + { + "epoch": 0.4401114206128134, + "grad_norm": 1.5645512342453003, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8689473867416382, + "num_tokens": 86311433.0, + "step": 2370 + }, + { + "epoch": 0.440297121634169, + "grad_norm": 1.5847842693328857, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8428810834884644, + "num_tokens": 86351755.0, + "step": 2371 + }, + { + "epoch": 0.44048282265552463, + "grad_norm": 1.681885838508606, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8463761806488037, + "num_tokens": 86387942.0, + "step": 2372 + }, + { + "epoch": 0.4406685236768802, + "grad_norm": 1.5428946018218994, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8631237149238586, + "num_tokens": 86423866.0, + "step": 2373 + }, + { + "epoch": 0.4408542246982358, + "grad_norm": 1.3937088251113892, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.878483772277832, + "num_tokens": 86465732.0, + "step": 2374 + }, + { + "epoch": 0.44103992571959144, + "grad_norm": 1.5961953401565552, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8612908124923706, + "num_tokens": 86499774.0, + "step": 2375 + }, + { + "epoch": 0.44122562674094706, + "grad_norm": 1.7433699369430542, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8489718437194824, + "num_tokens": 86531323.0, + "step": 2376 + }, + { + "epoch": 0.4414113277623027, + "grad_norm": 1.7174899578094482, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8405729532241821, + "num_tokens": 86569802.0, + "step": 2377 + }, + { + "epoch": 0.4415970287836583, + "grad_norm": 1.5976994037628174, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8620120882987976, + "num_tokens": 86607802.0, + "step": 2378 + }, + { + "epoch": 0.44178272980501393, + "grad_norm": 1.4611992835998535, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8655914068222046, + "num_tokens": 86643795.0, + "step": 2379 + }, + { + "epoch": 0.44196843082636955, + "grad_norm": 1.5195207595825195, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8625285625457764, + "num_tokens": 86678193.0, + "step": 2380 + }, + { + "epoch": 0.4421541318477252, + "grad_norm": 1.622690200805664, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.869947612285614, + "num_tokens": 86712078.0, + "step": 2381 + }, + { + "epoch": 0.4423398328690808, + "grad_norm": 1.4508496522903442, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8758548498153687, + "num_tokens": 86751753.0, + "step": 2382 + }, + { + "epoch": 0.4425255338904364, + "grad_norm": 1.4814635515213013, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8577752709388733, + "num_tokens": 86792186.0, + "step": 2383 + }, + { + "epoch": 0.44271123491179204, + "grad_norm": 1.6127458810806274, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8571141958236694, + "num_tokens": 86826480.0, + "step": 2384 + }, + { + "epoch": 0.4428969359331476, + "grad_norm": 1.5944111347198486, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8639391660690308, + "num_tokens": 86860508.0, + "step": 2385 + }, + { + "epoch": 0.44308263695450323, + "grad_norm": 1.5908136367797852, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8585290908813477, + "num_tokens": 86893190.0, + "step": 2386 + }, + { + "epoch": 0.44326833797585885, + "grad_norm": 1.611554503440857, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8506274223327637, + "num_tokens": 86929124.0, + "step": 2387 + }, + { + "epoch": 0.4434540389972145, + "grad_norm": 1.5688210725784302, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.846028745174408, + "num_tokens": 86970419.0, + "step": 2388 + }, + { + "epoch": 0.4436397400185701, + "grad_norm": 1.388094425201416, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8569289445877075, + "num_tokens": 87015203.0, + "step": 2389 + }, + { + "epoch": 0.4438254410399257, + "grad_norm": 1.5419559478759766, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8739938735961914, + "num_tokens": 87047573.0, + "step": 2390 + }, + { + "epoch": 0.44401114206128134, + "grad_norm": 1.465908408164978, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8629828095436096, + "num_tokens": 87084514.0, + "step": 2391 + }, + { + "epoch": 0.44419684308263696, + "grad_norm": 1.3704537153244019, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8639065027236938, + "num_tokens": 87128965.0, + "step": 2392 + }, + { + "epoch": 0.4443825441039926, + "grad_norm": 1.7255455255508423, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8556462526321411, + "num_tokens": 87159287.0, + "step": 2393 + }, + { + "epoch": 0.4445682451253482, + "grad_norm": 1.5029200315475464, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8673929572105408, + "num_tokens": 87198325.0, + "step": 2394 + }, + { + "epoch": 0.4447539461467038, + "grad_norm": 1.4792251586914062, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8733958005905151, + "num_tokens": 87233580.0, + "step": 2395 + }, + { + "epoch": 0.44493964716805945, + "grad_norm": 1.6342599391937256, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8687189817428589, + "num_tokens": 87270393.0, + "step": 2396 + }, + { + "epoch": 0.445125348189415, + "grad_norm": 1.6649657487869263, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8684987425804138, + "num_tokens": 87301929.0, + "step": 2397 + }, + { + "epoch": 0.44531104921077064, + "grad_norm": 1.6273466348648071, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8713333606719971, + "num_tokens": 87332658.0, + "step": 2398 + }, + { + "epoch": 0.44549675023212626, + "grad_norm": 1.5821789503097534, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8675704598426819, + "num_tokens": 87364123.0, + "step": 2399 + }, + { + "epoch": 0.4456824512534819, + "grad_norm": 1.4491170644760132, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8775043487548828, + "num_tokens": 87401308.0, + "step": 2400 + }, + { + "epoch": 0.4458681522748375, + "grad_norm": 1.620055913925171, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8559287786483765, + "num_tokens": 87434982.0, + "step": 2401 + }, + { + "epoch": 0.4460538532961931, + "grad_norm": 1.6492801904678345, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8627369999885559, + "num_tokens": 87468235.0, + "step": 2402 + }, + { + "epoch": 0.44623955431754875, + "grad_norm": 1.6950056552886963, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8310226202011108, + "num_tokens": 87505015.0, + "step": 2403 + }, + { + "epoch": 0.44642525533890437, + "grad_norm": 1.650247573852539, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8516016006469727, + "num_tokens": 87541774.0, + "step": 2404 + }, + { + "epoch": 0.44661095636026, + "grad_norm": 1.5520542860031128, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8589296340942383, + "num_tokens": 87577455.0, + "step": 2405 + }, + { + "epoch": 0.4467966573816156, + "grad_norm": 1.6333900690078735, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8606515526771545, + "num_tokens": 87613115.0, + "step": 2406 + }, + { + "epoch": 0.44698235840297124, + "grad_norm": 1.5300929546356201, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8370774984359741, + "num_tokens": 87655251.0, + "step": 2407 + }, + { + "epoch": 0.44716805942432686, + "grad_norm": 1.484673023223877, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8456854820251465, + "num_tokens": 87697887.0, + "step": 2408 + }, + { + "epoch": 0.4473537604456824, + "grad_norm": 1.5489364862442017, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8714519143104553, + "num_tokens": 87730290.0, + "step": 2409 + }, + { + "epoch": 0.44753946146703805, + "grad_norm": 1.4791743755340576, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8596043586730957, + "num_tokens": 87769604.0, + "step": 2410 + }, + { + "epoch": 0.44772516248839367, + "grad_norm": 1.6490797996520996, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8500655889511108, + "num_tokens": 87805210.0, + "step": 2411 + }, + { + "epoch": 0.4479108635097493, + "grad_norm": 1.4230343103408813, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8734414577484131, + "num_tokens": 87843921.0, + "step": 2412 + }, + { + "epoch": 0.4480965645311049, + "grad_norm": 1.507856845855713, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8598453402519226, + "num_tokens": 87882516.0, + "step": 2413 + }, + { + "epoch": 0.44828226555246053, + "grad_norm": 1.503535509109497, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8611406087875366, + "num_tokens": 87919293.0, + "step": 2414 + }, + { + "epoch": 0.44846796657381616, + "grad_norm": 1.4011931419372559, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.859448254108429, + "num_tokens": 87961112.0, + "step": 2415 + }, + { + "epoch": 0.4486536675951718, + "grad_norm": 1.4403642416000366, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8740181922912598, + "num_tokens": 88001763.0, + "step": 2416 + }, + { + "epoch": 0.4488393686165274, + "grad_norm": 1.6095882654190063, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8764657974243164, + "num_tokens": 88036842.0, + "step": 2417 + }, + { + "epoch": 0.449025069637883, + "grad_norm": 1.6001380681991577, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.853648841381073, + "num_tokens": 88074198.0, + "step": 2418 + }, + { + "epoch": 0.44921077065923865, + "grad_norm": 1.6759918928146362, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8537457585334778, + "num_tokens": 88109478.0, + "step": 2419 + }, + { + "epoch": 0.44939647168059427, + "grad_norm": 1.5828723907470703, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8700623512268066, + "num_tokens": 88141190.0, + "step": 2420 + }, + { + "epoch": 0.44958217270194983, + "grad_norm": 1.7752196788787842, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8291153907775879, + "num_tokens": 88171666.0, + "step": 2421 + }, + { + "epoch": 0.44976787372330546, + "grad_norm": 1.5991629362106323, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8618968725204468, + "num_tokens": 88204708.0, + "step": 2422 + }, + { + "epoch": 0.4499535747446611, + "grad_norm": 1.5492757558822632, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8466116189956665, + "num_tokens": 88245284.0, + "step": 2423 + }, + { + "epoch": 0.4501392757660167, + "grad_norm": 1.611425518989563, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8523756265640259, + "num_tokens": 88283924.0, + "step": 2424 + }, + { + "epoch": 0.4503249767873723, + "grad_norm": 1.6658730506896973, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8556996583938599, + "num_tokens": 88317082.0, + "step": 2425 + }, + { + "epoch": 0.45051067780872794, + "grad_norm": 1.5042372941970825, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8656793832778931, + "num_tokens": 88355159.0, + "step": 2426 + }, + { + "epoch": 0.45069637883008357, + "grad_norm": 1.6567456722259521, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8541801571846008, + "num_tokens": 88385626.0, + "step": 2427 + }, + { + "epoch": 0.4508820798514392, + "grad_norm": 1.5663135051727295, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8644492626190186, + "num_tokens": 88419732.0, + "step": 2428 + }, + { + "epoch": 0.4510677808727948, + "grad_norm": 1.5576820373535156, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8454523086547852, + "num_tokens": 88461015.0, + "step": 2429 + }, + { + "epoch": 0.45125348189415043, + "grad_norm": 1.746611475944519, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8625969886779785, + "num_tokens": 88488966.0, + "step": 2430 + }, + { + "epoch": 0.45143918291550605, + "grad_norm": 1.466282606124878, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8626012802124023, + "num_tokens": 88525108.0, + "step": 2431 + }, + { + "epoch": 0.4516248839368617, + "grad_norm": 1.541893720626831, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8543695211410522, + "num_tokens": 88563371.0, + "step": 2432 + }, + { + "epoch": 0.45181058495821724, + "grad_norm": 1.5190389156341553, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8630064725875854, + "num_tokens": 88599958.0, + "step": 2433 + }, + { + "epoch": 0.45199628597957286, + "grad_norm": 1.5390135049819946, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8563204407691956, + "num_tokens": 88640034.0, + "step": 2434 + }, + { + "epoch": 0.4521819870009285, + "grad_norm": 1.6777604818344116, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8613282442092896, + "num_tokens": 88672464.0, + "step": 2435 + }, + { + "epoch": 0.4523676880222841, + "grad_norm": 1.7911900281906128, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8331278562545776, + "num_tokens": 88704858.0, + "step": 2436 + }, + { + "epoch": 0.45255338904363973, + "grad_norm": 1.5762258768081665, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8536249399185181, + "num_tokens": 88739576.0, + "step": 2437 + }, + { + "epoch": 0.45273909006499535, + "grad_norm": 1.5734955072402954, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8598054647445679, + "num_tokens": 88775132.0, + "step": 2438 + }, + { + "epoch": 0.452924791086351, + "grad_norm": 1.4992507696151733, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.855384111404419, + "num_tokens": 88816034.0, + "step": 2439 + }, + { + "epoch": 0.4531104921077066, + "grad_norm": 1.5267142057418823, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8681104183197021, + "num_tokens": 88851274.0, + "step": 2440 + }, + { + "epoch": 0.4532961931290622, + "grad_norm": 1.5420857667922974, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8629218935966492, + "num_tokens": 88885907.0, + "step": 2441 + }, + { + "epoch": 0.45348189415041784, + "grad_norm": 1.5151737928390503, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8627505302429199, + "num_tokens": 88923648.0, + "step": 2442 + }, + { + "epoch": 0.45366759517177346, + "grad_norm": 1.5853781700134277, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8634775876998901, + "num_tokens": 88957514.0, + "step": 2443 + }, + { + "epoch": 0.4538532961931291, + "grad_norm": 1.4848718643188477, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8655852675437927, + "num_tokens": 88997381.0, + "step": 2444 + }, + { + "epoch": 0.45403899721448465, + "grad_norm": 1.525117039680481, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8572736978530884, + "num_tokens": 89033310.0, + "step": 2445 + }, + { + "epoch": 0.4542246982358403, + "grad_norm": 1.6584856510162354, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8770633935928345, + "num_tokens": 89062870.0, + "step": 2446 + }, + { + "epoch": 0.4544103992571959, + "grad_norm": 1.5808607339859009, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8632139563560486, + "num_tokens": 89098202.0, + "step": 2447 + }, + { + "epoch": 0.4545961002785515, + "grad_norm": 1.5408387184143066, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8595589399337769, + "num_tokens": 89131833.0, + "step": 2448 + }, + { + "epoch": 0.45478180129990714, + "grad_norm": 1.5536994934082031, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8576943874359131, + "num_tokens": 89167885.0, + "step": 2449 + }, + { + "epoch": 0.45496750232126276, + "grad_norm": 1.537602424621582, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8564575910568237, + "num_tokens": 89204509.0, + "step": 2450 + }, + { + "epoch": 0.4551532033426184, + "grad_norm": 1.4592976570129395, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8598098158836365, + "num_tokens": 89244056.0, + "step": 2451 + }, + { + "epoch": 0.455338904363974, + "grad_norm": 1.6577671766281128, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8608088493347168, + "num_tokens": 89277459.0, + "step": 2452 + }, + { + "epoch": 0.45552460538532963, + "grad_norm": 1.5651408433914185, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8619487881660461, + "num_tokens": 89315112.0, + "step": 2453 + }, + { + "epoch": 0.45571030640668525, + "grad_norm": 1.6452587842941284, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8474476337432861, + "num_tokens": 89353424.0, + "step": 2454 + }, + { + "epoch": 0.4558960074280409, + "grad_norm": 1.602244257926941, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.856601357460022, + "num_tokens": 89390273.0, + "step": 2455 + }, + { + "epoch": 0.4560817084493965, + "grad_norm": 1.5340909957885742, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8696473836898804, + "num_tokens": 89427779.0, + "step": 2456 + }, + { + "epoch": 0.4562674094707521, + "grad_norm": 1.611655592918396, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8544287085533142, + "num_tokens": 89463928.0, + "step": 2457 + }, + { + "epoch": 0.4564531104921077, + "grad_norm": 1.5384162664413452, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8451942801475525, + "num_tokens": 89503486.0, + "step": 2458 + }, + { + "epoch": 0.4566388115134633, + "grad_norm": 1.5365142822265625, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8567966222763062, + "num_tokens": 89542701.0, + "step": 2459 + }, + { + "epoch": 0.4568245125348189, + "grad_norm": 1.512643814086914, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8689988255500793, + "num_tokens": 89584398.0, + "step": 2460 + }, + { + "epoch": 0.45701021355617455, + "grad_norm": 1.5876660346984863, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8452503085136414, + "num_tokens": 89622442.0, + "step": 2461 + }, + { + "epoch": 0.45719591457753017, + "grad_norm": 1.5214416980743408, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8457968235015869, + "num_tokens": 89661303.0, + "step": 2462 + }, + { + "epoch": 0.4573816155988858, + "grad_norm": 1.4296047687530518, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8593000769615173, + "num_tokens": 89700069.0, + "step": 2463 + }, + { + "epoch": 0.4575673166202414, + "grad_norm": 1.4108325242996216, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8774819374084473, + "num_tokens": 89736322.0, + "step": 2464 + }, + { + "epoch": 0.45775301764159704, + "grad_norm": 1.4371260404586792, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.875525712966919, + "num_tokens": 89774707.0, + "step": 2465 + }, + { + "epoch": 0.45793871866295266, + "grad_norm": 1.5049399137496948, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8736715316772461, + "num_tokens": 89814875.0, + "step": 2466 + }, + { + "epoch": 0.4581244196843083, + "grad_norm": 1.5195943117141724, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8662510514259338, + "num_tokens": 89851067.0, + "step": 2467 + }, + { + "epoch": 0.4583101207056639, + "grad_norm": 1.446682095527649, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.869819700717926, + "num_tokens": 89891074.0, + "step": 2468 + }, + { + "epoch": 0.4584958217270195, + "grad_norm": 1.5312743186950684, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8611677289009094, + "num_tokens": 89930654.0, + "step": 2469 + }, + { + "epoch": 0.4586815227483751, + "grad_norm": 1.6014106273651123, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.858281672000885, + "num_tokens": 89965876.0, + "step": 2470 + }, + { + "epoch": 0.4588672237697307, + "grad_norm": 1.5813562870025635, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8777447938919067, + "num_tokens": 89997628.0, + "step": 2471 + }, + { + "epoch": 0.45905292479108634, + "grad_norm": 1.6677274703979492, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8779006004333496, + "num_tokens": 90033110.0, + "step": 2472 + }, + { + "epoch": 0.45923862581244196, + "grad_norm": 1.5660773515701294, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.870934247970581, + "num_tokens": 90066570.0, + "step": 2473 + }, + { + "epoch": 0.4594243268337976, + "grad_norm": 1.5771547555923462, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8498554229736328, + "num_tokens": 90103663.0, + "step": 2474 + }, + { + "epoch": 0.4596100278551532, + "grad_norm": 1.5570168495178223, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8683611750602722, + "num_tokens": 90137236.0, + "step": 2475 + }, + { + "epoch": 0.4597957288765088, + "grad_norm": 1.5477654933929443, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8599579334259033, + "num_tokens": 90177265.0, + "step": 2476 + }, + { + "epoch": 0.45998142989786445, + "grad_norm": 1.5362951755523682, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8591945767402649, + "num_tokens": 90214919.0, + "step": 2477 + }, + { + "epoch": 0.46016713091922007, + "grad_norm": 1.5713342428207397, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8562503457069397, + "num_tokens": 90248063.0, + "step": 2478 + }, + { + "epoch": 0.4603528319405757, + "grad_norm": 1.4960815906524658, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8534398674964905, + "num_tokens": 90291001.0, + "step": 2479 + }, + { + "epoch": 0.4605385329619313, + "grad_norm": 1.527403473854065, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8751720786094666, + "num_tokens": 90326994.0, + "step": 2480 + }, + { + "epoch": 0.46072423398328693, + "grad_norm": 1.594445824623108, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8507030010223389, + "num_tokens": 90363682.0, + "step": 2481 + }, + { + "epoch": 0.4609099350046425, + "grad_norm": 1.6741138696670532, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.850110650062561, + "num_tokens": 90397681.0, + "step": 2482 + }, + { + "epoch": 0.4610956360259981, + "grad_norm": 1.4290164709091187, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8565711379051208, + "num_tokens": 90438512.0, + "step": 2483 + }, + { + "epoch": 0.46128133704735375, + "grad_norm": 1.4306827783584595, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8781821131706238, + "num_tokens": 90478466.0, + "step": 2484 + }, + { + "epoch": 0.46146703806870937, + "grad_norm": 1.5361119508743286, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8607839941978455, + "num_tokens": 90514150.0, + "step": 2485 + }, + { + "epoch": 0.461652739090065, + "grad_norm": 1.6189426183700562, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8457157611846924, + "num_tokens": 90552532.0, + "step": 2486 + }, + { + "epoch": 0.4618384401114206, + "grad_norm": 1.542263150215149, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8645250201225281, + "num_tokens": 90586608.0, + "step": 2487 + }, + { + "epoch": 0.46202414113277623, + "grad_norm": 1.549036979675293, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.847763180732727, + "num_tokens": 90625601.0, + "step": 2488 + }, + { + "epoch": 0.46220984215413186, + "grad_norm": 1.7135837078094482, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8443487882614136, + "num_tokens": 90660975.0, + "step": 2489 + }, + { + "epoch": 0.4623955431754875, + "grad_norm": 1.5573620796203613, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8613469004631042, + "num_tokens": 90698547.0, + "step": 2490 + }, + { + "epoch": 0.4625812441968431, + "grad_norm": 1.5633556842803955, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8615565896034241, + "num_tokens": 90730514.0, + "step": 2491 + }, + { + "epoch": 0.4627669452181987, + "grad_norm": 1.5744251012802124, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.855626106262207, + "num_tokens": 90768132.0, + "step": 2492 + }, + { + "epoch": 0.46295264623955434, + "grad_norm": 1.5244237184524536, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8717405796051025, + "num_tokens": 90803467.0, + "step": 2493 + }, + { + "epoch": 0.4631383472609099, + "grad_norm": 1.4787732362747192, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8569285273551941, + "num_tokens": 90842140.0, + "step": 2494 + }, + { + "epoch": 0.46332404828226553, + "grad_norm": 1.5285563468933105, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8528745770454407, + "num_tokens": 90878544.0, + "step": 2495 + }, + { + "epoch": 0.46350974930362115, + "grad_norm": 1.6132144927978516, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8681436777114868, + "num_tokens": 90912745.0, + "step": 2496 + }, + { + "epoch": 0.4636954503249768, + "grad_norm": 1.3991472721099854, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8664805293083191, + "num_tokens": 90952332.0, + "step": 2497 + }, + { + "epoch": 0.4638811513463324, + "grad_norm": 1.4373910427093506, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8793883919715881, + "num_tokens": 90988414.0, + "step": 2498 + }, + { + "epoch": 0.464066852367688, + "grad_norm": 1.54072904586792, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8636236786842346, + "num_tokens": 91024698.0, + "step": 2499 + }, + { + "epoch": 0.46425255338904364, + "grad_norm": 1.4072608947753906, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.870139479637146, + "num_tokens": 91064461.0, + "step": 2500 + }, + { + "epoch": 0.46443825441039926, + "grad_norm": 1.6101226806640625, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8492509722709656, + "num_tokens": 91099107.0, + "step": 2501 + }, + { + "epoch": 0.4646239554317549, + "grad_norm": 1.551931619644165, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8421821594238281, + "num_tokens": 91136333.0, + "step": 2502 + }, + { + "epoch": 0.4648096564531105, + "grad_norm": 1.4819918870925903, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8703997135162354, + "num_tokens": 91172788.0, + "step": 2503 + }, + { + "epoch": 0.46499535747446613, + "grad_norm": 1.6031615734100342, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8364839553833008, + "num_tokens": 91212851.0, + "step": 2504 + }, + { + "epoch": 0.46518105849582175, + "grad_norm": 1.6229850053787231, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8640745282173157, + "num_tokens": 91244647.0, + "step": 2505 + }, + { + "epoch": 0.4653667595171773, + "grad_norm": 1.687352180480957, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8780158162117004, + "num_tokens": 91274907.0, + "step": 2506 + }, + { + "epoch": 0.46555246053853294, + "grad_norm": 1.504157543182373, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8495910167694092, + "num_tokens": 91314638.0, + "step": 2507 + }, + { + "epoch": 0.46573816155988856, + "grad_norm": 1.4487861394882202, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8716650605201721, + "num_tokens": 91354233.0, + "step": 2508 + }, + { + "epoch": 0.4659238625812442, + "grad_norm": 1.6646854877471924, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8668684959411621, + "num_tokens": 91386175.0, + "step": 2509 + }, + { + "epoch": 0.4661095636025998, + "grad_norm": 1.585359811782837, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8748981952667236, + "num_tokens": 91424273.0, + "step": 2510 + }, + { + "epoch": 0.46629526462395543, + "grad_norm": 1.5222389698028564, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.865944504737854, + "num_tokens": 91461085.0, + "step": 2511 + }, + { + "epoch": 0.46648096564531105, + "grad_norm": 1.5412973165512085, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8583757281303406, + "num_tokens": 91501271.0, + "step": 2512 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.686880350112915, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8429290056228638, + "num_tokens": 91536728.0, + "step": 2513 + }, + { + "epoch": 0.4668523676880223, + "grad_norm": 1.5894336700439453, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8648382425308228, + "num_tokens": 91572715.0, + "step": 2514 + }, + { + "epoch": 0.4670380687093779, + "grad_norm": 1.576601505279541, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8479723334312439, + "num_tokens": 91611982.0, + "step": 2515 + }, + { + "epoch": 0.46722376973073354, + "grad_norm": 1.476226568222046, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8678057789802551, + "num_tokens": 91648844.0, + "step": 2516 + }, + { + "epoch": 0.46740947075208916, + "grad_norm": 1.5990526676177979, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8591604232788086, + "num_tokens": 91683267.0, + "step": 2517 + }, + { + "epoch": 0.46759517177344473, + "grad_norm": 1.4603983163833618, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8743733763694763, + "num_tokens": 91724684.0, + "step": 2518 + }, + { + "epoch": 0.46778087279480035, + "grad_norm": 1.5331971645355225, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.845738410949707, + "num_tokens": 91763049.0, + "step": 2519 + }, + { + "epoch": 0.467966573816156, + "grad_norm": 1.6606050729751587, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8573470115661621, + "num_tokens": 91796460.0, + "step": 2520 + }, + { + "epoch": 0.4681522748375116, + "grad_norm": 1.6871453523635864, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8522096872329712, + "num_tokens": 91828879.0, + "step": 2521 + }, + { + "epoch": 0.4683379758588672, + "grad_norm": 1.6645382642745972, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8585377931594849, + "num_tokens": 91865205.0, + "step": 2522 + }, + { + "epoch": 0.46852367688022284, + "grad_norm": 1.5122753381729126, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8545400500297546, + "num_tokens": 91905659.0, + "step": 2523 + }, + { + "epoch": 0.46870937790157846, + "grad_norm": 1.6043506860733032, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8572984337806702, + "num_tokens": 91941894.0, + "step": 2524 + }, + { + "epoch": 0.4688950789229341, + "grad_norm": 1.542724847793579, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8699181079864502, + "num_tokens": 91976187.0, + "step": 2525 + }, + { + "epoch": 0.4690807799442897, + "grad_norm": 1.4499289989471436, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8519721627235413, + "num_tokens": 92018483.0, + "step": 2526 + }, + { + "epoch": 0.4692664809656453, + "grad_norm": 1.500524878501892, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.862659215927124, + "num_tokens": 92056894.0, + "step": 2527 + }, + { + "epoch": 0.46945218198700095, + "grad_norm": 1.6519882678985596, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8532604575157166, + "num_tokens": 92090823.0, + "step": 2528 + }, + { + "epoch": 0.46963788300835657, + "grad_norm": 1.6911592483520508, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8665818572044373, + "num_tokens": 92126384.0, + "step": 2529 + }, + { + "epoch": 0.46982358402971214, + "grad_norm": 1.617491364479065, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.863916277885437, + "num_tokens": 92160660.0, + "step": 2530 + }, + { + "epoch": 0.47000928505106776, + "grad_norm": 1.4972249269485474, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8635990619659424, + "num_tokens": 92200142.0, + "step": 2531 + }, + { + "epoch": 0.4701949860724234, + "grad_norm": 1.4996660947799683, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8408147096633911, + "num_tokens": 92242492.0, + "step": 2532 + }, + { + "epoch": 0.470380687093779, + "grad_norm": 1.572730302810669, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8716586828231812, + "num_tokens": 92278727.0, + "step": 2533 + }, + { + "epoch": 0.4705663881151346, + "grad_norm": 1.6051511764526367, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8434126973152161, + "num_tokens": 92319300.0, + "step": 2534 + }, + { + "epoch": 0.47075208913649025, + "grad_norm": 1.6628391742706299, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8524510264396667, + "num_tokens": 92352330.0, + "step": 2535 + }, + { + "epoch": 0.47093779015784587, + "grad_norm": 1.5604697465896606, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8659136891365051, + "num_tokens": 92387040.0, + "step": 2536 + }, + { + "epoch": 0.4711234911792015, + "grad_norm": 1.5117665529251099, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8610442876815796, + "num_tokens": 92425507.0, + "step": 2537 + }, + { + "epoch": 0.4713091922005571, + "grad_norm": 1.4619885683059692, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8474719524383545, + "num_tokens": 92467749.0, + "step": 2538 + }, + { + "epoch": 0.47149489322191274, + "grad_norm": 1.4560742378234863, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8675987720489502, + "num_tokens": 92506734.0, + "step": 2539 + }, + { + "epoch": 0.47168059424326836, + "grad_norm": 1.617261528968811, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.859982430934906, + "num_tokens": 92544757.0, + "step": 2540 + }, + { + "epoch": 0.471866295264624, + "grad_norm": 1.5060399770736694, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8581963777542114, + "num_tokens": 92585674.0, + "step": 2541 + }, + { + "epoch": 0.47205199628597955, + "grad_norm": 1.489582896232605, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8676808476448059, + "num_tokens": 92625843.0, + "step": 2542 + }, + { + "epoch": 0.47223769730733517, + "grad_norm": 1.4403071403503418, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8779009580612183, + "num_tokens": 92662835.0, + "step": 2543 + }, + { + "epoch": 0.4724233983286908, + "grad_norm": 1.6030405759811401, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8639336824417114, + "num_tokens": 92697141.0, + "step": 2544 + }, + { + "epoch": 0.4726090993500464, + "grad_norm": 1.6567418575286865, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8720943927764893, + "num_tokens": 92729964.0, + "step": 2545 + }, + { + "epoch": 0.47279480037140204, + "grad_norm": 1.5719239711761475, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8707388639450073, + "num_tokens": 92763816.0, + "step": 2546 + }, + { + "epoch": 0.47298050139275766, + "grad_norm": 1.667629599571228, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8525793552398682, + "num_tokens": 92798878.0, + "step": 2547 + }, + { + "epoch": 0.4731662024141133, + "grad_norm": 1.5832138061523438, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.872534990310669, + "num_tokens": 92832525.0, + "step": 2548 + }, + { + "epoch": 0.4733519034354689, + "grad_norm": 1.4998689889907837, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8754237294197083, + "num_tokens": 92868941.0, + "step": 2549 + }, + { + "epoch": 0.4735376044568245, + "grad_norm": 1.4080153703689575, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8626840114593506, + "num_tokens": 92910441.0, + "step": 2550 + }, + { + "epoch": 0.47372330547818015, + "grad_norm": 1.5126831531524658, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8758020401000977, + "num_tokens": 92949580.0, + "step": 2551 + }, + { + "epoch": 0.47390900649953577, + "grad_norm": 1.504909873008728, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8617452383041382, + "num_tokens": 92989502.0, + "step": 2552 + }, + { + "epoch": 0.4740947075208914, + "grad_norm": 1.4802627563476562, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8707950711250305, + "num_tokens": 93022083.0, + "step": 2553 + }, + { + "epoch": 0.47428040854224696, + "grad_norm": 1.4957711696624756, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8735285997390747, + "num_tokens": 93058531.0, + "step": 2554 + }, + { + "epoch": 0.4744661095636026, + "grad_norm": 1.4287171363830566, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.868865966796875, + "num_tokens": 93099431.0, + "step": 2555 + }, + { + "epoch": 0.4746518105849582, + "grad_norm": 1.5171610116958618, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8698511123657227, + "num_tokens": 93133747.0, + "step": 2556 + }, + { + "epoch": 0.4748375116063138, + "grad_norm": 1.5506856441497803, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8540194034576416, + "num_tokens": 93172977.0, + "step": 2557 + }, + { + "epoch": 0.47502321262766944, + "grad_norm": 1.5001572370529175, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8491888046264648, + "num_tokens": 93216273.0, + "step": 2558 + }, + { + "epoch": 0.47520891364902507, + "grad_norm": 1.6822903156280518, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8619193434715271, + "num_tokens": 93250329.0, + "step": 2559 + }, + { + "epoch": 0.4753946146703807, + "grad_norm": 1.5483362674713135, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8592251539230347, + "num_tokens": 93288459.0, + "step": 2560 + }, + { + "epoch": 0.4755803156917363, + "grad_norm": 1.5135979652404785, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8577499389648438, + "num_tokens": 93328068.0, + "step": 2561 + }, + { + "epoch": 0.47576601671309193, + "grad_norm": 1.5219072103500366, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8718434572219849, + "num_tokens": 93365081.0, + "step": 2562 + }, + { + "epoch": 0.47595171773444755, + "grad_norm": 1.6021668910980225, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8477373123168945, + "num_tokens": 93399117.0, + "step": 2563 + }, + { + "epoch": 0.4761374187558032, + "grad_norm": 1.5605993270874023, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8406877517700195, + "num_tokens": 93440307.0, + "step": 2564 + }, + { + "epoch": 0.4763231197771588, + "grad_norm": 1.588229775428772, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8645240068435669, + "num_tokens": 93474251.0, + "step": 2565 + }, + { + "epoch": 0.47650882079851437, + "grad_norm": 1.4222569465637207, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8595613241195679, + "num_tokens": 93512313.0, + "step": 2566 + }, + { + "epoch": 0.47669452181987, + "grad_norm": 1.5134764909744263, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8654088973999023, + "num_tokens": 93550744.0, + "step": 2567 + }, + { + "epoch": 0.4768802228412256, + "grad_norm": 1.615021824836731, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8550719022750854, + "num_tokens": 93588986.0, + "step": 2568 + }, + { + "epoch": 0.47706592386258123, + "grad_norm": 1.5941720008850098, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8479381799697876, + "num_tokens": 93624351.0, + "step": 2569 + }, + { + "epoch": 0.47725162488393685, + "grad_norm": 1.6868113279342651, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8518446087837219, + "num_tokens": 93658092.0, + "step": 2570 + }, + { + "epoch": 0.4774373259052925, + "grad_norm": 1.6923152208328247, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8370701670646667, + "num_tokens": 93692257.0, + "step": 2571 + }, + { + "epoch": 0.4776230269266481, + "grad_norm": 1.4432913064956665, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8808990716934204, + "num_tokens": 93728262.0, + "step": 2572 + }, + { + "epoch": 0.4778087279480037, + "grad_norm": 1.5907649993896484, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8688668608665466, + "num_tokens": 93762652.0, + "step": 2573 + }, + { + "epoch": 0.47799442896935934, + "grad_norm": 1.5834110975265503, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8821862936019897, + "num_tokens": 93797007.0, + "step": 2574 + }, + { + "epoch": 0.47818012999071496, + "grad_norm": 1.6982089281082153, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8653268814086914, + "num_tokens": 93828372.0, + "step": 2575 + }, + { + "epoch": 0.4783658310120706, + "grad_norm": 1.7007842063903809, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8607044219970703, + "num_tokens": 93858937.0, + "step": 2576 + }, + { + "epoch": 0.4785515320334262, + "grad_norm": 1.6917023658752441, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8494308590888977, + "num_tokens": 93892427.0, + "step": 2577 + }, + { + "epoch": 0.4787372330547818, + "grad_norm": 1.620012640953064, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8593613505363464, + "num_tokens": 93927782.0, + "step": 2578 + }, + { + "epoch": 0.4789229340761374, + "grad_norm": 1.4182720184326172, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8586567044258118, + "num_tokens": 93970843.0, + "step": 2579 + }, + { + "epoch": 0.479108635097493, + "grad_norm": 1.5661647319793701, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8652483224868774, + "num_tokens": 94002903.0, + "step": 2580 + }, + { + "epoch": 0.47929433611884864, + "grad_norm": 1.4280436038970947, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8733114004135132, + "num_tokens": 94039429.0, + "step": 2581 + }, + { + "epoch": 0.47948003714020426, + "grad_norm": 1.708574891090393, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8656747937202454, + "num_tokens": 94072260.0, + "step": 2582 + }, + { + "epoch": 0.4796657381615599, + "grad_norm": 1.5877553224563599, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8491969108581543, + "num_tokens": 94107633.0, + "step": 2583 + }, + { + "epoch": 0.4798514391829155, + "grad_norm": 1.5411279201507568, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8454189300537109, + "num_tokens": 94148500.0, + "step": 2584 + }, + { + "epoch": 0.48003714020427113, + "grad_norm": 1.5059871673583984, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8689257502555847, + "num_tokens": 94186537.0, + "step": 2585 + }, + { + "epoch": 0.48022284122562675, + "grad_norm": 1.3875292539596558, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.87015300989151, + "num_tokens": 94227096.0, + "step": 2586 + }, + { + "epoch": 0.4804085422469824, + "grad_norm": 1.35344660282135, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8809949159622192, + "num_tokens": 94265655.0, + "step": 2587 + }, + { + "epoch": 0.480594243268338, + "grad_norm": 1.5122007131576538, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8637843728065491, + "num_tokens": 94302842.0, + "step": 2588 + }, + { + "epoch": 0.4807799442896936, + "grad_norm": 1.5877481698989868, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8666948080062866, + "num_tokens": 94337593.0, + "step": 2589 + }, + { + "epoch": 0.4809656453110492, + "grad_norm": 1.5869734287261963, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8495298027992249, + "num_tokens": 94374418.0, + "step": 2590 + }, + { + "epoch": 0.4811513463324048, + "grad_norm": 1.6165006160736084, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8448827266693115, + "num_tokens": 94410055.0, + "step": 2591 + }, + { + "epoch": 0.4813370473537604, + "grad_norm": 1.5532104969024658, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8596533536911011, + "num_tokens": 94449128.0, + "step": 2592 + }, + { + "epoch": 0.48152274837511605, + "grad_norm": 1.5983203649520874, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8645251989364624, + "num_tokens": 94480263.0, + "step": 2593 + }, + { + "epoch": 0.48170844939647167, + "grad_norm": 1.4523595571517944, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8590028285980225, + "num_tokens": 94522243.0, + "step": 2594 + }, + { + "epoch": 0.4818941504178273, + "grad_norm": 1.5049991607666016, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8593659996986389, + "num_tokens": 94560559.0, + "step": 2595 + }, + { + "epoch": 0.4820798514391829, + "grad_norm": 1.6062933206558228, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8809648752212524, + "num_tokens": 94595504.0, + "step": 2596 + }, + { + "epoch": 0.48226555246053854, + "grad_norm": 1.5146923065185547, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8633805513381958, + "num_tokens": 94634902.0, + "step": 2597 + }, + { + "epoch": 0.48245125348189416, + "grad_norm": 1.647903561592102, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8561915159225464, + "num_tokens": 94671866.0, + "step": 2598 + }, + { + "epoch": 0.4826369545032498, + "grad_norm": 1.5615978240966797, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8543348908424377, + "num_tokens": 94710002.0, + "step": 2599 + }, + { + "epoch": 0.4828226555246054, + "grad_norm": 1.5020031929016113, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8636423349380493, + "num_tokens": 94749739.0, + "step": 2600 + }, + { + "epoch": 0.483008356545961, + "grad_norm": 1.58348548412323, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8442374467849731, + "num_tokens": 94788719.0, + "step": 2601 + }, + { + "epoch": 0.4831940575673166, + "grad_norm": 1.6153806447982788, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8649575710296631, + "num_tokens": 94820344.0, + "step": 2602 + }, + { + "epoch": 0.4833797585886722, + "grad_norm": 1.5363647937774658, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8828285932540894, + "num_tokens": 94858902.0, + "step": 2603 + }, + { + "epoch": 0.48356545961002784, + "grad_norm": 1.6937423944473267, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8477532863616943, + "num_tokens": 94890109.0, + "step": 2604 + }, + { + "epoch": 0.48375116063138346, + "grad_norm": 1.6630527973175049, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8683964014053345, + "num_tokens": 94920141.0, + "step": 2605 + }, + { + "epoch": 0.4839368616527391, + "grad_norm": 1.529544711112976, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8832979798316956, + "num_tokens": 94954952.0, + "step": 2606 + }, + { + "epoch": 0.4841225626740947, + "grad_norm": 1.4991296529769897, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8661047220230103, + "num_tokens": 94994782.0, + "step": 2607 + }, + { + "epoch": 0.4843082636954503, + "grad_norm": 1.6228444576263428, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8640632033348083, + "num_tokens": 95028033.0, + "step": 2608 + }, + { + "epoch": 0.48449396471680595, + "grad_norm": 1.4004032611846924, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8682471513748169, + "num_tokens": 95070577.0, + "step": 2609 + }, + { + "epoch": 0.48467966573816157, + "grad_norm": 1.5615888833999634, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8562930822372437, + "num_tokens": 95107260.0, + "step": 2610 + }, + { + "epoch": 0.4848653667595172, + "grad_norm": 1.7140053510665894, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8627305626869202, + "num_tokens": 95140787.0, + "step": 2611 + }, + { + "epoch": 0.4850510677808728, + "grad_norm": 1.5193474292755127, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8637828826904297, + "num_tokens": 95179403.0, + "step": 2612 + }, + { + "epoch": 0.48523676880222844, + "grad_norm": 1.5911734104156494, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.873746395111084, + "num_tokens": 95212708.0, + "step": 2613 + }, + { + "epoch": 0.48542246982358406, + "grad_norm": 1.555022120475769, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8657838702201843, + "num_tokens": 95246266.0, + "step": 2614 + }, + { + "epoch": 0.4856081708449396, + "grad_norm": 1.4844846725463867, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8525882959365845, + "num_tokens": 95287053.0, + "step": 2615 + }, + { + "epoch": 0.48579387186629525, + "grad_norm": 1.509222149848938, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8633451461791992, + "num_tokens": 95321610.0, + "step": 2616 + }, + { + "epoch": 0.48597957288765087, + "grad_norm": 1.8301646709442139, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8634932637214661, + "num_tokens": 95351386.0, + "step": 2617 + }, + { + "epoch": 0.4861652739090065, + "grad_norm": 1.706959843635559, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8668324947357178, + "num_tokens": 95381806.0, + "step": 2618 + }, + { + "epoch": 0.4863509749303621, + "grad_norm": 1.614105224609375, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8588118553161621, + "num_tokens": 95414362.0, + "step": 2619 + }, + { + "epoch": 0.48653667595171773, + "grad_norm": 1.5914300680160522, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.856400191783905, + "num_tokens": 95448386.0, + "step": 2620 + }, + { + "epoch": 0.48672237697307336, + "grad_norm": 1.613061785697937, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8617228865623474, + "num_tokens": 95480862.0, + "step": 2621 + }, + { + "epoch": 0.486908077994429, + "grad_norm": 1.4970673322677612, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8561310768127441, + "num_tokens": 95518951.0, + "step": 2622 + }, + { + "epoch": 0.4870937790157846, + "grad_norm": 1.6140938997268677, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8570522665977478, + "num_tokens": 95554894.0, + "step": 2623 + }, + { + "epoch": 0.4872794800371402, + "grad_norm": 1.4956549406051636, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8683738112449646, + "num_tokens": 95590828.0, + "step": 2624 + }, + { + "epoch": 0.48746518105849584, + "grad_norm": 1.377957820892334, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8744533061981201, + "num_tokens": 95629714.0, + "step": 2625 + }, + { + "epoch": 0.48765088207985147, + "grad_norm": 1.6647123098373413, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8606499433517456, + "num_tokens": 95660450.0, + "step": 2626 + }, + { + "epoch": 0.48783658310120703, + "grad_norm": 1.4528586864471436, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8647286891937256, + "num_tokens": 95702734.0, + "step": 2627 + }, + { + "epoch": 0.48802228412256266, + "grad_norm": 1.41800057888031, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8619030714035034, + "num_tokens": 95743007.0, + "step": 2628 + }, + { + "epoch": 0.4882079851439183, + "grad_norm": 1.5197261571884155, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8706401586532593, + "num_tokens": 95776943.0, + "step": 2629 + }, + { + "epoch": 0.4883936861652739, + "grad_norm": 1.6444602012634277, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8439799547195435, + "num_tokens": 95814468.0, + "step": 2630 + }, + { + "epoch": 0.4885793871866295, + "grad_norm": 1.5842152833938599, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8730172514915466, + "num_tokens": 95848044.0, + "step": 2631 + }, + { + "epoch": 0.48876508820798514, + "grad_norm": 1.5063170194625854, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8582428693771362, + "num_tokens": 95883769.0, + "step": 2632 + }, + { + "epoch": 0.48895078922934077, + "grad_norm": 1.5716450214385986, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8562296628952026, + "num_tokens": 95923399.0, + "step": 2633 + }, + { + "epoch": 0.4891364902506964, + "grad_norm": 1.5817314386367798, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8563527464866638, + "num_tokens": 95959123.0, + "step": 2634 + }, + { + "epoch": 0.489322191272052, + "grad_norm": 1.6642539501190186, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8422225117683411, + "num_tokens": 95994717.0, + "step": 2635 + }, + { + "epoch": 0.48950789229340763, + "grad_norm": 1.5377850532531738, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8632968664169312, + "num_tokens": 96030405.0, + "step": 2636 + }, + { + "epoch": 0.48969359331476325, + "grad_norm": 1.633778691291809, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8685691952705383, + "num_tokens": 96068323.0, + "step": 2637 + }, + { + "epoch": 0.4898792943361189, + "grad_norm": 1.587335467338562, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8605589866638184, + "num_tokens": 96102008.0, + "step": 2638 + }, + { + "epoch": 0.49006499535747444, + "grad_norm": 1.6691436767578125, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.856291651725769, + "num_tokens": 96138964.0, + "step": 2639 + }, + { + "epoch": 0.49025069637883006, + "grad_norm": 1.6855727434158325, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8650964498519897, + "num_tokens": 96168336.0, + "step": 2640 + }, + { + "epoch": 0.4904363974001857, + "grad_norm": 1.5893727540969849, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8602489233016968, + "num_tokens": 96202782.0, + "step": 2641 + }, + { + "epoch": 0.4906220984215413, + "grad_norm": 1.714888334274292, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8717594742774963, + "num_tokens": 96232770.0, + "step": 2642 + }, + { + "epoch": 0.49080779944289693, + "grad_norm": 1.5116764307022095, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8514810800552368, + "num_tokens": 96269176.0, + "step": 2643 + }, + { + "epoch": 0.49099350046425255, + "grad_norm": 1.387861967086792, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8696776628494263, + "num_tokens": 96312077.0, + "step": 2644 + }, + { + "epoch": 0.4911792014856082, + "grad_norm": 1.4822571277618408, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8651025295257568, + "num_tokens": 96351113.0, + "step": 2645 + }, + { + "epoch": 0.4913649025069638, + "grad_norm": 1.5421255826950073, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8703421950340271, + "num_tokens": 96388286.0, + "step": 2646 + }, + { + "epoch": 0.4915506035283194, + "grad_norm": 1.4632036685943604, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8622846603393555, + "num_tokens": 96428740.0, + "step": 2647 + }, + { + "epoch": 0.49173630454967504, + "grad_norm": 1.5627470016479492, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8565033078193665, + "num_tokens": 96466528.0, + "step": 2648 + }, + { + "epoch": 0.49192200557103066, + "grad_norm": 1.7038029432296753, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8487241864204407, + "num_tokens": 96503679.0, + "step": 2649 + }, + { + "epoch": 0.4921077065923863, + "grad_norm": 1.6099759340286255, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8648512363433838, + "num_tokens": 96537468.0, + "step": 2650 + }, + { + "epoch": 0.49229340761374185, + "grad_norm": 1.6360571384429932, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8507115840911865, + "num_tokens": 96574796.0, + "step": 2651 + }, + { + "epoch": 0.4924791086350975, + "grad_norm": 1.5991958379745483, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8663920164108276, + "num_tokens": 96607999.0, + "step": 2652 + }, + { + "epoch": 0.4926648096564531, + "grad_norm": 1.5196716785430908, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8595336675643921, + "num_tokens": 96644096.0, + "step": 2653 + }, + { + "epoch": 0.4928505106778087, + "grad_norm": 1.7971845865249634, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8486001491546631, + "num_tokens": 96676364.0, + "step": 2654 + }, + { + "epoch": 0.49303621169916434, + "grad_norm": 1.7054013013839722, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8480225801467896, + "num_tokens": 96709457.0, + "step": 2655 + }, + { + "epoch": 0.49322191272051996, + "grad_norm": 1.522616982460022, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8630677461624146, + "num_tokens": 96747568.0, + "step": 2656 + }, + { + "epoch": 0.4934076137418756, + "grad_norm": 1.5637086629867554, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8430148363113403, + "num_tokens": 96791043.0, + "step": 2657 + }, + { + "epoch": 0.4935933147632312, + "grad_norm": 1.3946765661239624, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8626910448074341, + "num_tokens": 96834393.0, + "step": 2658 + }, + { + "epoch": 0.4937790157845868, + "grad_norm": 1.6323505640029907, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8677900433540344, + "num_tokens": 96867113.0, + "step": 2659 + }, + { + "epoch": 0.49396471680594245, + "grad_norm": 1.4766311645507812, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8746609687805176, + "num_tokens": 96904951.0, + "step": 2660 + }, + { + "epoch": 0.49415041782729807, + "grad_norm": 1.6191129684448242, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8454841375350952, + "num_tokens": 96941861.0, + "step": 2661 + }, + { + "epoch": 0.4943361188486537, + "grad_norm": 1.4629967212677002, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8725250959396362, + "num_tokens": 96982598.0, + "step": 2662 + }, + { + "epoch": 0.49452181987000926, + "grad_norm": 1.550772786140442, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8699184656143188, + "num_tokens": 97017690.0, + "step": 2663 + }, + { + "epoch": 0.4947075208913649, + "grad_norm": 1.5242756605148315, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8684830665588379, + "num_tokens": 97056785.0, + "step": 2664 + }, + { + "epoch": 0.4948932219127205, + "grad_norm": 1.4671926498413086, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8749480843544006, + "num_tokens": 97091798.0, + "step": 2665 + }, + { + "epoch": 0.4950789229340761, + "grad_norm": 1.6689865589141846, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8485708832740784, + "num_tokens": 97126135.0, + "step": 2666 + }, + { + "epoch": 0.49526462395543175, + "grad_norm": 1.473095178604126, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8636297583580017, + "num_tokens": 97167189.0, + "step": 2667 + }, + { + "epoch": 0.49545032497678737, + "grad_norm": 1.8017362356185913, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8622563481330872, + "num_tokens": 97194266.0, + "step": 2668 + }, + { + "epoch": 0.495636025998143, + "grad_norm": 1.657605767250061, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8655024766921997, + "num_tokens": 97229970.0, + "step": 2669 + }, + { + "epoch": 0.4958217270194986, + "grad_norm": 1.511227011680603, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8589045405387878, + "num_tokens": 97269510.0, + "step": 2670 + }, + { + "epoch": 0.49600742804085424, + "grad_norm": 1.6097774505615234, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8519450426101685, + "num_tokens": 97304473.0, + "step": 2671 + }, + { + "epoch": 0.49619312906220986, + "grad_norm": 1.7976237535476685, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8508009314537048, + "num_tokens": 97340360.0, + "step": 2672 + }, + { + "epoch": 0.4963788300835655, + "grad_norm": 1.622033715248108, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8514957427978516, + "num_tokens": 97379008.0, + "step": 2673 + }, + { + "epoch": 0.4965645311049211, + "grad_norm": 1.6774301528930664, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8470853567123413, + "num_tokens": 97413206.0, + "step": 2674 + }, + { + "epoch": 0.49675023212627667, + "grad_norm": 1.6120136976242065, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8650840520858765, + "num_tokens": 97444582.0, + "step": 2675 + }, + { + "epoch": 0.4969359331476323, + "grad_norm": 1.535586953163147, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8638567328453064, + "num_tokens": 97483101.0, + "step": 2676 + }, + { + "epoch": 0.4971216341689879, + "grad_norm": 1.51872718334198, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8595521450042725, + "num_tokens": 97517145.0, + "step": 2677 + }, + { + "epoch": 0.49730733519034354, + "grad_norm": 1.7155332565307617, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8664071559906006, + "num_tokens": 97546424.0, + "step": 2678 + }, + { + "epoch": 0.49749303621169916, + "grad_norm": 1.3950377702713013, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8731451630592346, + "num_tokens": 97588688.0, + "step": 2679 + }, + { + "epoch": 0.4976787372330548, + "grad_norm": 1.7094950675964355, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.833785355091095, + "num_tokens": 97622449.0, + "step": 2680 + }, + { + "epoch": 0.4978644382544104, + "grad_norm": 1.481602668762207, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8602922558784485, + "num_tokens": 97661272.0, + "step": 2681 + }, + { + "epoch": 0.498050139275766, + "grad_norm": 1.5589288473129272, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8662084341049194, + "num_tokens": 97695687.0, + "step": 2682 + }, + { + "epoch": 0.49823584029712165, + "grad_norm": 1.5659675598144531, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8692084550857544, + "num_tokens": 97730784.0, + "step": 2683 + }, + { + "epoch": 0.49842154131847727, + "grad_norm": 1.5698002576828003, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8563963174819946, + "num_tokens": 97765692.0, + "step": 2684 + }, + { + "epoch": 0.4986072423398329, + "grad_norm": 1.6447175741195679, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8482261896133423, + "num_tokens": 97801289.0, + "step": 2685 + }, + { + "epoch": 0.4987929433611885, + "grad_norm": 1.6225836277008057, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8623321056365967, + "num_tokens": 97836475.0, + "step": 2686 + }, + { + "epoch": 0.4989786443825441, + "grad_norm": 1.6070244312286377, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.86631178855896, + "num_tokens": 97870818.0, + "step": 2687 + }, + { + "epoch": 0.4991643454038997, + "grad_norm": 1.5683907270431519, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8536003232002258, + "num_tokens": 97907187.0, + "step": 2688 + }, + { + "epoch": 0.4993500464252553, + "grad_norm": 1.5331279039382935, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8626481890678406, + "num_tokens": 97947033.0, + "step": 2689 + }, + { + "epoch": 0.49953574744661094, + "grad_norm": 1.6978060007095337, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8559743165969849, + "num_tokens": 97980408.0, + "step": 2690 + }, + { + "epoch": 0.49972144846796657, + "grad_norm": 1.643722653388977, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8625414371490479, + "num_tokens": 98012649.0, + "step": 2691 + }, + { + "epoch": 0.4999071494893222, + "grad_norm": 1.4365873336791992, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8567183613777161, + "num_tokens": 98055270.0, + "step": 2692 + }, + { + "epoch": 0.5000928505106778, + "grad_norm": 1.558657169342041, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8598536252975464, + "num_tokens": 98091578.0, + "step": 2693 + }, + { + "epoch": 0.5002785515320334, + "grad_norm": 1.5855069160461426, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8517730236053467, + "num_tokens": 98127056.0, + "step": 2694 + }, + { + "epoch": 0.500464252553389, + "grad_norm": 1.5319145917892456, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.856866717338562, + "num_tokens": 98163394.0, + "step": 2695 + }, + { + "epoch": 0.5006499535747446, + "grad_norm": 1.6273823976516724, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.853446364402771, + "num_tokens": 98198851.0, + "step": 2696 + }, + { + "epoch": 0.5008356545961002, + "grad_norm": 1.6577882766723633, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8469182848930359, + "num_tokens": 98236185.0, + "step": 2697 + }, + { + "epoch": 0.5010213556174559, + "grad_norm": 1.4733456373214722, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8824922442436218, + "num_tokens": 98270807.0, + "step": 2698 + }, + { + "epoch": 0.5012070566388115, + "grad_norm": 1.4505813121795654, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8842810392379761, + "num_tokens": 98306344.0, + "step": 2699 + }, + { + "epoch": 0.5013927576601671, + "grad_norm": 1.6101268529891968, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8712247014045715, + "num_tokens": 98339260.0, + "step": 2700 + }, + { + "epoch": 0.5015784586815227, + "grad_norm": 1.4614372253417969, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8672266006469727, + "num_tokens": 98378817.0, + "step": 2701 + }, + { + "epoch": 0.5017641597028784, + "grad_norm": 1.6830639839172363, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8837738633155823, + "num_tokens": 98406526.0, + "step": 2702 + }, + { + "epoch": 0.501949860724234, + "grad_norm": 1.5228792428970337, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8481419086456299, + "num_tokens": 98447836.0, + "step": 2703 + }, + { + "epoch": 0.5021355617455896, + "grad_norm": 1.4119384288787842, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8649187088012695, + "num_tokens": 98490774.0, + "step": 2704 + }, + { + "epoch": 0.5023212627669452, + "grad_norm": 1.4913755655288696, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8743972778320312, + "num_tokens": 98526130.0, + "step": 2705 + }, + { + "epoch": 0.5025069637883008, + "grad_norm": 1.5218316316604614, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8576009273529053, + "num_tokens": 98567196.0, + "step": 2706 + }, + { + "epoch": 0.5026926648096565, + "grad_norm": 1.3837441205978394, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8750504851341248, + "num_tokens": 98605880.0, + "step": 2707 + }, + { + "epoch": 0.5028783658310121, + "grad_norm": 1.5613774061203003, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8570946455001831, + "num_tokens": 98641669.0, + "step": 2708 + }, + { + "epoch": 0.5030640668523677, + "grad_norm": 1.5849837064743042, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.872225284576416, + "num_tokens": 98675204.0, + "step": 2709 + }, + { + "epoch": 0.5032497678737233, + "grad_norm": 1.613435983657837, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8645391464233398, + "num_tokens": 98707418.0, + "step": 2710 + }, + { + "epoch": 0.503435468895079, + "grad_norm": 1.8261181116104126, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8549489378929138, + "num_tokens": 98735150.0, + "step": 2711 + }, + { + "epoch": 0.5036211699164346, + "grad_norm": 1.657118320465088, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8698475956916809, + "num_tokens": 98767969.0, + "step": 2712 + }, + { + "epoch": 0.5038068709377902, + "grad_norm": 1.497411847114563, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8821256756782532, + "num_tokens": 98800963.0, + "step": 2713 + }, + { + "epoch": 0.5039925719591458, + "grad_norm": 1.610406517982483, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8701589107513428, + "num_tokens": 98833351.0, + "step": 2714 + }, + { + "epoch": 0.5041782729805014, + "grad_norm": 1.6146187782287598, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8646074533462524, + "num_tokens": 98865896.0, + "step": 2715 + }, + { + "epoch": 0.5043639740018571, + "grad_norm": 1.7426209449768066, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8494205474853516, + "num_tokens": 98896188.0, + "step": 2716 + }, + { + "epoch": 0.5045496750232126, + "grad_norm": 1.638550043106079, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8606477975845337, + "num_tokens": 98930223.0, + "step": 2717 + }, + { + "epoch": 0.5047353760445682, + "grad_norm": 1.518390417098999, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8500798344612122, + "num_tokens": 98968800.0, + "step": 2718 + }, + { + "epoch": 0.5049210770659238, + "grad_norm": 1.4495068788528442, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8690465092658997, + "num_tokens": 99006694.0, + "step": 2719 + }, + { + "epoch": 0.5051067780872794, + "grad_norm": 1.4817508459091187, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8703593015670776, + "num_tokens": 99047089.0, + "step": 2720 + }, + { + "epoch": 0.5052924791086351, + "grad_norm": 1.7594618797302246, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8667809963226318, + "num_tokens": 99077126.0, + "step": 2721 + }, + { + "epoch": 0.5054781801299907, + "grad_norm": 1.3667782545089722, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.874059796333313, + "num_tokens": 99119888.0, + "step": 2722 + }, + { + "epoch": 0.5056638811513463, + "grad_norm": 1.5315895080566406, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8703048229217529, + "num_tokens": 99155265.0, + "step": 2723 + }, + { + "epoch": 0.5058495821727019, + "grad_norm": 1.5577116012573242, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8718379735946655, + "num_tokens": 99187807.0, + "step": 2724 + }, + { + "epoch": 0.5060352831940576, + "grad_norm": 1.608828067779541, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8549425601959229, + "num_tokens": 99220867.0, + "step": 2725 + }, + { + "epoch": 0.5062209842154132, + "grad_norm": 1.4851043224334717, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8585813641548157, + "num_tokens": 99257916.0, + "step": 2726 + }, + { + "epoch": 0.5064066852367688, + "grad_norm": 1.4937056303024292, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8552006483078003, + "num_tokens": 99300187.0, + "step": 2727 + }, + { + "epoch": 0.5065923862581244, + "grad_norm": 1.4680988788604736, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8675824403762817, + "num_tokens": 99341325.0, + "step": 2728 + }, + { + "epoch": 0.50677808727948, + "grad_norm": 1.403704285621643, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8640316724777222, + "num_tokens": 99384019.0, + "step": 2729 + }, + { + "epoch": 0.5069637883008357, + "grad_norm": 1.616843581199646, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8542369604110718, + "num_tokens": 99418034.0, + "step": 2730 + }, + { + "epoch": 0.5071494893221913, + "grad_norm": 1.6072674989700317, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8658632040023804, + "num_tokens": 99451199.0, + "step": 2731 + }, + { + "epoch": 0.5073351903435469, + "grad_norm": 1.592301368713379, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8681769967079163, + "num_tokens": 99484453.0, + "step": 2732 + }, + { + "epoch": 0.5075208913649025, + "grad_norm": 1.4928513765335083, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8702093362808228, + "num_tokens": 99522325.0, + "step": 2733 + }, + { + "epoch": 0.5077065923862581, + "grad_norm": 1.435942530632019, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8696807622909546, + "num_tokens": 99560940.0, + "step": 2734 + }, + { + "epoch": 0.5078922934076138, + "grad_norm": 1.4797269105911255, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8666361570358276, + "num_tokens": 99599538.0, + "step": 2735 + }, + { + "epoch": 0.5080779944289694, + "grad_norm": 1.556023359298706, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8582234382629395, + "num_tokens": 99637101.0, + "step": 2736 + }, + { + "epoch": 0.508263695450325, + "grad_norm": 1.4650874137878418, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8680033683776855, + "num_tokens": 99675113.0, + "step": 2737 + }, + { + "epoch": 0.5084493964716806, + "grad_norm": 1.6538304090499878, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8720983266830444, + "num_tokens": 99708439.0, + "step": 2738 + }, + { + "epoch": 0.5086350974930363, + "grad_norm": 1.4931707382202148, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.867793619632721, + "num_tokens": 99747066.0, + "step": 2739 + }, + { + "epoch": 0.5088207985143919, + "grad_norm": 1.5357534885406494, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8673302531242371, + "num_tokens": 99781755.0, + "step": 2740 + }, + { + "epoch": 0.5090064995357474, + "grad_norm": 1.5789134502410889, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8680320382118225, + "num_tokens": 99817721.0, + "step": 2741 + }, + { + "epoch": 0.509192200557103, + "grad_norm": 1.4561220407485962, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8677823543548584, + "num_tokens": 99858416.0, + "step": 2742 + }, + { + "epoch": 0.5093779015784586, + "grad_norm": 1.4697493314743042, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8662622570991516, + "num_tokens": 99898023.0, + "step": 2743 + }, + { + "epoch": 0.5095636025998143, + "grad_norm": 1.5194655656814575, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8657873868942261, + "num_tokens": 99938086.0, + "step": 2744 + }, + { + "epoch": 0.5097493036211699, + "grad_norm": 1.77232027053833, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8390347957611084, + "num_tokens": 99968176.0, + "step": 2745 + }, + { + "epoch": 0.5099350046425255, + "grad_norm": 1.5324002504348755, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8739689588546753, + "num_tokens": 100003301.0, + "step": 2746 + }, + { + "epoch": 0.5101207056638811, + "grad_norm": 1.5272496938705444, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8635808229446411, + "num_tokens": 100040363.0, + "step": 2747 + }, + { + "epoch": 0.5103064066852367, + "grad_norm": 1.7700430154800415, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8554918169975281, + "num_tokens": 100071711.0, + "step": 2748 + }, + { + "epoch": 0.5104921077065924, + "grad_norm": 1.4796781539916992, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8663389086723328, + "num_tokens": 100113368.0, + "step": 2749 + }, + { + "epoch": 0.510677808727948, + "grad_norm": 1.6639717817306519, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8510935306549072, + "num_tokens": 100150258.0, + "step": 2750 + }, + { + "epoch": 0.5108635097493036, + "grad_norm": 1.6308315992355347, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8550184369087219, + "num_tokens": 100185766.0, + "step": 2751 + }, + { + "epoch": 0.5110492107706592, + "grad_norm": 1.385431170463562, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8689976930618286, + "num_tokens": 100228937.0, + "step": 2752 + }, + { + "epoch": 0.5112349117920149, + "grad_norm": 1.5734851360321045, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8637823462486267, + "num_tokens": 100265207.0, + "step": 2753 + }, + { + "epoch": 0.5114206128133705, + "grad_norm": 1.493384599685669, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.864579439163208, + "num_tokens": 100300997.0, + "step": 2754 + }, + { + "epoch": 0.5116063138347261, + "grad_norm": 1.598172903060913, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.862183690071106, + "num_tokens": 100337251.0, + "step": 2755 + }, + { + "epoch": 0.5117920148560817, + "grad_norm": 1.5449578762054443, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8544471263885498, + "num_tokens": 100373894.0, + "step": 2756 + }, + { + "epoch": 0.5119777158774373, + "grad_norm": 1.6334589719772339, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8793977499008179, + "num_tokens": 100401756.0, + "step": 2757 + }, + { + "epoch": 0.512163416898793, + "grad_norm": 1.4770736694335938, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8688068389892578, + "num_tokens": 100437441.0, + "step": 2758 + }, + { + "epoch": 0.5123491179201486, + "grad_norm": 1.502403736114502, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8525593876838684, + "num_tokens": 100475773.0, + "step": 2759 + }, + { + "epoch": 0.5125348189415042, + "grad_norm": 1.4871387481689453, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8616006970405579, + "num_tokens": 100516100.0, + "step": 2760 + }, + { + "epoch": 0.5127205199628598, + "grad_norm": 1.5147380828857422, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8698267936706543, + "num_tokens": 100552726.0, + "step": 2761 + }, + { + "epoch": 0.5129062209842155, + "grad_norm": 1.6457141637802124, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8542162179946899, + "num_tokens": 100586598.0, + "step": 2762 + }, + { + "epoch": 0.5130919220055711, + "grad_norm": 1.5228124856948853, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8648203611373901, + "num_tokens": 100621682.0, + "step": 2763 + }, + { + "epoch": 0.5132776230269267, + "grad_norm": 1.4765315055847168, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8696842193603516, + "num_tokens": 100658748.0, + "step": 2764 + }, + { + "epoch": 0.5134633240482822, + "grad_norm": 1.58063542842865, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8391885757446289, + "num_tokens": 100699665.0, + "step": 2765 + }, + { + "epoch": 0.5136490250696378, + "grad_norm": 1.5206748247146606, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8541914224624634, + "num_tokens": 100737821.0, + "step": 2766 + }, + { + "epoch": 0.5138347260909935, + "grad_norm": 1.5069849491119385, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8642673492431641, + "num_tokens": 100778050.0, + "step": 2767 + }, + { + "epoch": 0.5140204271123491, + "grad_norm": 1.4994670152664185, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8667005300521851, + "num_tokens": 100815451.0, + "step": 2768 + }, + { + "epoch": 0.5142061281337047, + "grad_norm": 1.3622653484344482, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8710605502128601, + "num_tokens": 100855584.0, + "step": 2769 + }, + { + "epoch": 0.5143918291550603, + "grad_norm": 1.4280352592468262, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8444489240646362, + "num_tokens": 100899531.0, + "step": 2770 + }, + { + "epoch": 0.5145775301764159, + "grad_norm": 1.731406331062317, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8715469837188721, + "num_tokens": 100931378.0, + "step": 2771 + }, + { + "epoch": 0.5147632311977716, + "grad_norm": 1.6199798583984375, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.86449134349823, + "num_tokens": 100969882.0, + "step": 2772 + }, + { + "epoch": 0.5149489322191272, + "grad_norm": 1.5037970542907715, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8685965538024902, + "num_tokens": 101006247.0, + "step": 2773 + }, + { + "epoch": 0.5151346332404828, + "grad_norm": 1.6023263931274414, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8671895265579224, + "num_tokens": 101039195.0, + "step": 2774 + }, + { + "epoch": 0.5153203342618384, + "grad_norm": 1.6164655685424805, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8627227544784546, + "num_tokens": 101071112.0, + "step": 2775 + }, + { + "epoch": 0.515506035283194, + "grad_norm": 1.4069099426269531, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8799586296081543, + "num_tokens": 101109248.0, + "step": 2776 + }, + { + "epoch": 0.5156917363045497, + "grad_norm": 1.5638638734817505, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8601120710372925, + "num_tokens": 101148247.0, + "step": 2777 + }, + { + "epoch": 0.5158774373259053, + "grad_norm": 1.529056191444397, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8710412979125977, + "num_tokens": 101188134.0, + "step": 2778 + }, + { + "epoch": 0.5160631383472609, + "grad_norm": 1.739214539527893, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8554878234863281, + "num_tokens": 101222515.0, + "step": 2779 + }, + { + "epoch": 0.5162488393686165, + "grad_norm": 1.4756391048431396, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8479979038238525, + "num_tokens": 101263811.0, + "step": 2780 + }, + { + "epoch": 0.5164345403899722, + "grad_norm": 1.4833413362503052, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8703489303588867, + "num_tokens": 101302286.0, + "step": 2781 + }, + { + "epoch": 0.5166202414113278, + "grad_norm": 1.676375150680542, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8613215684890747, + "num_tokens": 101333920.0, + "step": 2782 + }, + { + "epoch": 0.5168059424326834, + "grad_norm": 1.585735559463501, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8598635792732239, + "num_tokens": 101371991.0, + "step": 2783 + }, + { + "epoch": 0.516991643454039, + "grad_norm": 1.681166410446167, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8604587316513062, + "num_tokens": 101405207.0, + "step": 2784 + }, + { + "epoch": 0.5171773444753947, + "grad_norm": 1.5443086624145508, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8717571496963501, + "num_tokens": 101440845.0, + "step": 2785 + }, + { + "epoch": 0.5173630454967503, + "grad_norm": 1.463988184928894, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8740809559822083, + "num_tokens": 101481864.0, + "step": 2786 + }, + { + "epoch": 0.5175487465181059, + "grad_norm": 1.5295830965042114, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8664327263832092, + "num_tokens": 101520756.0, + "step": 2787 + }, + { + "epoch": 0.5177344475394615, + "grad_norm": 1.616368293762207, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8613173365592957, + "num_tokens": 101558052.0, + "step": 2788 + }, + { + "epoch": 0.5179201485608171, + "grad_norm": 1.5270079374313354, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8688284158706665, + "num_tokens": 101591505.0, + "step": 2789 + }, + { + "epoch": 0.5181058495821727, + "grad_norm": 1.4932944774627686, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.878896176815033, + "num_tokens": 101627874.0, + "step": 2790 + }, + { + "epoch": 0.5182915506035283, + "grad_norm": 1.6942373514175415, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.866915225982666, + "num_tokens": 101657774.0, + "step": 2791 + }, + { + "epoch": 0.5184772516248839, + "grad_norm": 1.6725281476974487, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.86109858751297, + "num_tokens": 101692547.0, + "step": 2792 + }, + { + "epoch": 0.5186629526462395, + "grad_norm": 1.5008467435836792, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8731257915496826, + "num_tokens": 101730539.0, + "step": 2793 + }, + { + "epoch": 0.5188486536675951, + "grad_norm": 1.6297845840454102, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8476008176803589, + "num_tokens": 101765437.0, + "step": 2794 + }, + { + "epoch": 0.5190343546889508, + "grad_norm": 1.5471924543380737, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8574293851852417, + "num_tokens": 101802754.0, + "step": 2795 + }, + { + "epoch": 0.5192200557103064, + "grad_norm": 1.4643391370773315, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8718045949935913, + "num_tokens": 101842486.0, + "step": 2796 + }, + { + "epoch": 0.519405756731662, + "grad_norm": 1.5395363569259644, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8612942099571228, + "num_tokens": 101878254.0, + "step": 2797 + }, + { + "epoch": 0.5195914577530176, + "grad_norm": 1.5562840700149536, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8652390241622925, + "num_tokens": 101912585.0, + "step": 2798 + }, + { + "epoch": 0.5197771587743732, + "grad_norm": 1.5381985902786255, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8777376413345337, + "num_tokens": 101950651.0, + "step": 2799 + }, + { + "epoch": 0.5199628597957289, + "grad_norm": 1.598483681678772, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8679710626602173, + "num_tokens": 101988201.0, + "step": 2800 + }, + { + "epoch": 0.5201485608170845, + "grad_norm": 1.624598741531372, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8547279834747314, + "num_tokens": 102023572.0, + "step": 2801 + }, + { + "epoch": 0.5203342618384401, + "grad_norm": 1.8026291131973267, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8635685443878174, + "num_tokens": 102054409.0, + "step": 2802 + }, + { + "epoch": 0.5205199628597957, + "grad_norm": 1.4703161716461182, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8577553033828735, + "num_tokens": 102098015.0, + "step": 2803 + }, + { + "epoch": 0.5207056638811514, + "grad_norm": 1.5997627973556519, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8547704219818115, + "num_tokens": 102139112.0, + "step": 2804 + }, + { + "epoch": 0.520891364902507, + "grad_norm": 1.5469366312026978, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8632710576057434, + "num_tokens": 102176652.0, + "step": 2805 + }, + { + "epoch": 0.5210770659238626, + "grad_norm": 1.648364782333374, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8469103574752808, + "num_tokens": 102214344.0, + "step": 2806 + }, + { + "epoch": 0.5212627669452182, + "grad_norm": 1.5486942529678345, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8636934757232666, + "num_tokens": 102250802.0, + "step": 2807 + }, + { + "epoch": 0.5214484679665738, + "grad_norm": 1.6041234731674194, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8509527444839478, + "num_tokens": 102288686.0, + "step": 2808 + }, + { + "epoch": 0.5216341689879295, + "grad_norm": 1.5042253732681274, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8704270124435425, + "num_tokens": 102328562.0, + "step": 2809 + }, + { + "epoch": 0.5218198700092851, + "grad_norm": 1.6771752834320068, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.855944037437439, + "num_tokens": 102363558.0, + "step": 2810 + }, + { + "epoch": 0.5220055710306407, + "grad_norm": 1.7247886657714844, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8550934791564941, + "num_tokens": 102399305.0, + "step": 2811 + }, + { + "epoch": 0.5221912720519963, + "grad_norm": 1.4397135972976685, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8729695677757263, + "num_tokens": 102436651.0, + "step": 2812 + }, + { + "epoch": 0.522376973073352, + "grad_norm": 1.5024343729019165, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.85398268699646, + "num_tokens": 102476786.0, + "step": 2813 + }, + { + "epoch": 0.5225626740947075, + "grad_norm": 1.5608291625976562, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.860966682434082, + "num_tokens": 102515797.0, + "step": 2814 + }, + { + "epoch": 0.5227483751160631, + "grad_norm": 1.5293173789978027, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8630654215812683, + "num_tokens": 102554575.0, + "step": 2815 + }, + { + "epoch": 0.5229340761374187, + "grad_norm": 1.495579481124878, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8673739433288574, + "num_tokens": 102594803.0, + "step": 2816 + }, + { + "epoch": 0.5231197771587743, + "grad_norm": 1.6582006216049194, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8601954579353333, + "num_tokens": 102628574.0, + "step": 2817 + }, + { + "epoch": 0.52330547818013, + "grad_norm": 1.5424392223358154, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8774673938751221, + "num_tokens": 102660988.0, + "step": 2818 + }, + { + "epoch": 0.5234911792014856, + "grad_norm": 1.4902337789535522, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8570356369018555, + "num_tokens": 102700659.0, + "step": 2819 + }, + { + "epoch": 0.5236768802228412, + "grad_norm": 1.561714768409729, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8582649230957031, + "num_tokens": 102738842.0, + "step": 2820 + }, + { + "epoch": 0.5238625812441968, + "grad_norm": 1.6964097023010254, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8549506664276123, + "num_tokens": 102772517.0, + "step": 2821 + }, + { + "epoch": 0.5240482822655524, + "grad_norm": 1.4774422645568848, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8722161054611206, + "num_tokens": 102810625.0, + "step": 2822 + }, + { + "epoch": 0.5242339832869081, + "grad_norm": 1.6051537990570068, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8551872968673706, + "num_tokens": 102846055.0, + "step": 2823 + }, + { + "epoch": 0.5244196843082637, + "grad_norm": 1.4833647012710571, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8546460270881653, + "num_tokens": 102885957.0, + "step": 2824 + }, + { + "epoch": 0.5246053853296193, + "grad_norm": 1.491105556488037, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8556119203567505, + "num_tokens": 102927357.0, + "step": 2825 + }, + { + "epoch": 0.5247910863509749, + "grad_norm": 1.5069197416305542, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8748733997344971, + "num_tokens": 102962057.0, + "step": 2826 + }, + { + "epoch": 0.5249767873723306, + "grad_norm": 1.5597723722457886, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8349138498306274, + "num_tokens": 103002430.0, + "step": 2827 + }, + { + "epoch": 0.5251624883936862, + "grad_norm": 1.6914105415344238, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8579855561256409, + "num_tokens": 103033765.0, + "step": 2828 + }, + { + "epoch": 0.5253481894150418, + "grad_norm": 1.3935415744781494, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.879694938659668, + "num_tokens": 103075603.0, + "step": 2829 + }, + { + "epoch": 0.5255338904363974, + "grad_norm": 1.51462984085083, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.867989182472229, + "num_tokens": 103110874.0, + "step": 2830 + }, + { + "epoch": 0.525719591457753, + "grad_norm": 1.5081652402877808, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.857807993888855, + "num_tokens": 103149997.0, + "step": 2831 + }, + { + "epoch": 0.5259052924791087, + "grad_norm": 1.5243043899536133, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8586049675941467, + "num_tokens": 103185418.0, + "step": 2832 + }, + { + "epoch": 0.5260909935004643, + "grad_norm": 1.6293426752090454, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8619405627250671, + "num_tokens": 103220355.0, + "step": 2833 + }, + { + "epoch": 0.5262766945218199, + "grad_norm": 1.557559609413147, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8617530465126038, + "num_tokens": 103261227.0, + "step": 2834 + }, + { + "epoch": 0.5264623955431755, + "grad_norm": 1.7890024185180664, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8647781014442444, + "num_tokens": 103288101.0, + "step": 2835 + }, + { + "epoch": 0.5266480965645312, + "grad_norm": 1.6819335222244263, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8511234521865845, + "num_tokens": 103319623.0, + "step": 2836 + }, + { + "epoch": 0.5268337975858868, + "grad_norm": 1.517689824104309, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8696885108947754, + "num_tokens": 103356624.0, + "step": 2837 + }, + { + "epoch": 0.5270194986072423, + "grad_norm": 1.6248153448104858, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8638684749603271, + "num_tokens": 103388981.0, + "step": 2838 + }, + { + "epoch": 0.5272051996285979, + "grad_norm": 1.5322355031967163, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8593111038208008, + "num_tokens": 103426586.0, + "step": 2839 + }, + { + "epoch": 0.5273909006499535, + "grad_norm": 1.5923527479171753, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8707654476165771, + "num_tokens": 103463091.0, + "step": 2840 + }, + { + "epoch": 0.5275766016713092, + "grad_norm": 1.541510820388794, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8645154237747192, + "num_tokens": 103502153.0, + "step": 2841 + }, + { + "epoch": 0.5277623026926648, + "grad_norm": 1.7142908573150635, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8418346047401428, + "num_tokens": 103539407.0, + "step": 2842 + }, + { + "epoch": 0.5279480037140204, + "grad_norm": 1.5152277946472168, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8652316331863403, + "num_tokens": 103579933.0, + "step": 2843 + }, + { + "epoch": 0.528133704735376, + "grad_norm": 1.420615315437317, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.858254075050354, + "num_tokens": 103623636.0, + "step": 2844 + }, + { + "epoch": 0.5283194057567316, + "grad_norm": 1.5670607089996338, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8637462854385376, + "num_tokens": 103658687.0, + "step": 2845 + }, + { + "epoch": 0.5285051067780873, + "grad_norm": 1.409656047821045, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8705792427062988, + "num_tokens": 103701806.0, + "step": 2846 + }, + { + "epoch": 0.5286908077994429, + "grad_norm": 1.4938185214996338, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8602966666221619, + "num_tokens": 103740325.0, + "step": 2847 + }, + { + "epoch": 0.5288765088207985, + "grad_norm": 1.6690624952316284, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8716833591461182, + "num_tokens": 103773671.0, + "step": 2848 + }, + { + "epoch": 0.5290622098421541, + "grad_norm": 1.555891752243042, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8471505641937256, + "num_tokens": 103811933.0, + "step": 2849 + }, + { + "epoch": 0.5292479108635098, + "grad_norm": 1.5320724248886108, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8611098527908325, + "num_tokens": 103849805.0, + "step": 2850 + }, + { + "epoch": 0.5294336118848654, + "grad_norm": 1.4816067218780518, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8890973329544067, + "num_tokens": 103884970.0, + "step": 2851 + }, + { + "epoch": 0.529619312906221, + "grad_norm": 1.6170226335525513, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8687093257904053, + "num_tokens": 103920684.0, + "step": 2852 + }, + { + "epoch": 0.5298050139275766, + "grad_norm": 1.696804165840149, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8464212417602539, + "num_tokens": 103951625.0, + "step": 2853 + }, + { + "epoch": 0.5299907149489322, + "grad_norm": 1.5976310968399048, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8537710905075073, + "num_tokens": 103987887.0, + "step": 2854 + }, + { + "epoch": 0.5301764159702879, + "grad_norm": 1.517220139503479, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8673912882804871, + "num_tokens": 104024974.0, + "step": 2855 + }, + { + "epoch": 0.5303621169916435, + "grad_norm": 1.6577274799346924, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8540701270103455, + "num_tokens": 104060518.0, + "step": 2856 + }, + { + "epoch": 0.5305478180129991, + "grad_norm": 1.5161850452423096, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8705898523330688, + "num_tokens": 104097973.0, + "step": 2857 + }, + { + "epoch": 0.5307335190343547, + "grad_norm": 1.5331584215164185, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8575494289398193, + "num_tokens": 104138636.0, + "step": 2858 + }, + { + "epoch": 0.5309192200557104, + "grad_norm": 1.4830005168914795, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8766937851905823, + "num_tokens": 104176707.0, + "step": 2859 + }, + { + "epoch": 0.531104921077066, + "grad_norm": 1.5256547927856445, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8646847605705261, + "num_tokens": 104220236.0, + "step": 2860 + }, + { + "epoch": 0.5312906220984216, + "grad_norm": 1.4540234804153442, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8658458590507507, + "num_tokens": 104259874.0, + "step": 2861 + }, + { + "epoch": 0.5314763231197771, + "grad_norm": 1.7978006601333618, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8607527613639832, + "num_tokens": 104295781.0, + "step": 2862 + }, + { + "epoch": 0.5316620241411327, + "grad_norm": 1.598503828048706, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.862335741519928, + "num_tokens": 104327761.0, + "step": 2863 + }, + { + "epoch": 0.5318477251624883, + "grad_norm": 1.530352234840393, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8631314635276794, + "num_tokens": 104364316.0, + "step": 2864 + }, + { + "epoch": 0.532033426183844, + "grad_norm": 1.5216845273971558, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8666436076164246, + "num_tokens": 104400385.0, + "step": 2865 + }, + { + "epoch": 0.5322191272051996, + "grad_norm": 1.4488662481307983, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8692547082901001, + "num_tokens": 104439757.0, + "step": 2866 + }, + { + "epoch": 0.5324048282265552, + "grad_norm": 1.5604794025421143, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8661853671073914, + "num_tokens": 104473400.0, + "step": 2867 + }, + { + "epoch": 0.5325905292479108, + "grad_norm": 1.5349992513656616, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8646402359008789, + "num_tokens": 104509150.0, + "step": 2868 + }, + { + "epoch": 0.5327762302692665, + "grad_norm": 1.5042051076889038, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8612176179885864, + "num_tokens": 104544919.0, + "step": 2869 + }, + { + "epoch": 0.5329619312906221, + "grad_norm": 1.5413075685501099, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8696022629737854, + "num_tokens": 104578587.0, + "step": 2870 + }, + { + "epoch": 0.5331476323119777, + "grad_norm": 1.5227274894714355, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8652209639549255, + "num_tokens": 104616166.0, + "step": 2871 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.6277387142181396, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8706022500991821, + "num_tokens": 104650852.0, + "step": 2872 + }, + { + "epoch": 0.533519034354689, + "grad_norm": 1.5316863059997559, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8551070094108582, + "num_tokens": 104691563.0, + "step": 2873 + }, + { + "epoch": 0.5337047353760446, + "grad_norm": 1.5917706489562988, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8690463900566101, + "num_tokens": 104724278.0, + "step": 2874 + }, + { + "epoch": 0.5338904363974002, + "grad_norm": 1.5483871698379517, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.869680643081665, + "num_tokens": 104761750.0, + "step": 2875 + }, + { + "epoch": 0.5340761374187558, + "grad_norm": 1.6246949434280396, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8678984642028809, + "num_tokens": 104791059.0, + "step": 2876 + }, + { + "epoch": 0.5342618384401114, + "grad_norm": 1.619994044303894, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8611623644828796, + "num_tokens": 104822731.0, + "step": 2877 + }, + { + "epoch": 0.5344475394614671, + "grad_norm": 1.805242896080017, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8692331314086914, + "num_tokens": 104854534.0, + "step": 2878 + }, + { + "epoch": 0.5346332404828227, + "grad_norm": 1.4745734930038452, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8582967519760132, + "num_tokens": 104896048.0, + "step": 2879 + }, + { + "epoch": 0.5348189415041783, + "grad_norm": 1.514650821685791, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8615310192108154, + "num_tokens": 104935735.0, + "step": 2880 + }, + { + "epoch": 0.5350046425255339, + "grad_norm": 1.544297456741333, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8709221482276917, + "num_tokens": 104969496.0, + "step": 2881 + }, + { + "epoch": 0.5351903435468895, + "grad_norm": 1.5739983320236206, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8553961515426636, + "num_tokens": 105006809.0, + "step": 2882 + }, + { + "epoch": 0.5353760445682452, + "grad_norm": 1.619816541671753, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8637264966964722, + "num_tokens": 105037729.0, + "step": 2883 + }, + { + "epoch": 0.5355617455896008, + "grad_norm": 1.6348613500595093, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8691787719726562, + "num_tokens": 105069176.0, + "step": 2884 + }, + { + "epoch": 0.5357474466109564, + "grad_norm": 1.5093306303024292, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8548509478569031, + "num_tokens": 105108301.0, + "step": 2885 + }, + { + "epoch": 0.5359331476323119, + "grad_norm": 1.5443767309188843, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8619071245193481, + "num_tokens": 105141490.0, + "step": 2886 + }, + { + "epoch": 0.5361188486536675, + "grad_norm": 1.5047520399093628, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8585057258605957, + "num_tokens": 105180928.0, + "step": 2887 + }, + { + "epoch": 0.5363045496750232, + "grad_norm": 1.5146139860153198, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8569825887680054, + "num_tokens": 105221214.0, + "step": 2888 + }, + { + "epoch": 0.5364902506963788, + "grad_norm": 1.708269715309143, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8503422737121582, + "num_tokens": 105254930.0, + "step": 2889 + }, + { + "epoch": 0.5366759517177344, + "grad_norm": 1.5069645643234253, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8548133373260498, + "num_tokens": 105293104.0, + "step": 2890 + }, + { + "epoch": 0.53686165273909, + "grad_norm": 1.5674437284469604, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8701038360595703, + "num_tokens": 105325440.0, + "step": 2891 + }, + { + "epoch": 0.5370473537604457, + "grad_norm": 1.4774311780929565, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8719780445098877, + "num_tokens": 105363770.0, + "step": 2892 + }, + { + "epoch": 0.5372330547818013, + "grad_norm": 1.5022739171981812, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8742059469223022, + "num_tokens": 105400240.0, + "step": 2893 + }, + { + "epoch": 0.5374187558031569, + "grad_norm": 1.5013294219970703, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8677637577056885, + "num_tokens": 105437723.0, + "step": 2894 + }, + { + "epoch": 0.5376044568245125, + "grad_norm": 1.7398054599761963, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8601657152175903, + "num_tokens": 105466405.0, + "step": 2895 + }, + { + "epoch": 0.5377901578458681, + "grad_norm": 1.5978504419326782, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.854816198348999, + "num_tokens": 105501408.0, + "step": 2896 + }, + { + "epoch": 0.5379758588672238, + "grad_norm": 1.4393439292907715, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8716444969177246, + "num_tokens": 105541515.0, + "step": 2897 + }, + { + "epoch": 0.5381615598885794, + "grad_norm": 1.7273439168930054, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8466842174530029, + "num_tokens": 105573721.0, + "step": 2898 + }, + { + "epoch": 0.538347260909935, + "grad_norm": 1.5592559576034546, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8639808893203735, + "num_tokens": 105606302.0, + "step": 2899 + }, + { + "epoch": 0.5385329619312906, + "grad_norm": 1.5891510248184204, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8685223460197449, + "num_tokens": 105642765.0, + "step": 2900 + }, + { + "epoch": 0.5387186629526463, + "grad_norm": 1.6668167114257812, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8491898775100708, + "num_tokens": 105676928.0, + "step": 2901 + }, + { + "epoch": 0.5389043639740019, + "grad_norm": 1.4974182844161987, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8689415454864502, + "num_tokens": 105715447.0, + "step": 2902 + }, + { + "epoch": 0.5390900649953575, + "grad_norm": 1.5071873664855957, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8592824339866638, + "num_tokens": 105754680.0, + "step": 2903 + }, + { + "epoch": 0.5392757660167131, + "grad_norm": 1.5010231733322144, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.867661714553833, + "num_tokens": 105791452.0, + "step": 2904 + }, + { + "epoch": 0.5394614670380687, + "grad_norm": 1.5379247665405273, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8819535970687866, + "num_tokens": 105826660.0, + "step": 2905 + }, + { + "epoch": 0.5396471680594244, + "grad_norm": 1.5812913179397583, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8539522886276245, + "num_tokens": 105863539.0, + "step": 2906 + }, + { + "epoch": 0.53983286908078, + "grad_norm": 1.5172252655029297, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.862006425857544, + "num_tokens": 105900413.0, + "step": 2907 + }, + { + "epoch": 0.5400185701021356, + "grad_norm": 1.5011135339736938, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8648180961608887, + "num_tokens": 105937188.0, + "step": 2908 + }, + { + "epoch": 0.5402042711234912, + "grad_norm": 1.4563380479812622, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8641961812973022, + "num_tokens": 105980742.0, + "step": 2909 + }, + { + "epoch": 0.5403899721448467, + "grad_norm": 1.5308120250701904, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8623546361923218, + "num_tokens": 106020153.0, + "step": 2910 + }, + { + "epoch": 0.5405756731662024, + "grad_norm": 1.553324580192566, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8777053356170654, + "num_tokens": 106057727.0, + "step": 2911 + }, + { + "epoch": 0.540761374187558, + "grad_norm": 1.5553114414215088, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8746246695518494, + "num_tokens": 106094667.0, + "step": 2912 + }, + { + "epoch": 0.5409470752089136, + "grad_norm": 1.4258546829223633, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8620801568031311, + "num_tokens": 106136174.0, + "step": 2913 + }, + { + "epoch": 0.5411327762302692, + "grad_norm": 1.5716867446899414, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.852705717086792, + "num_tokens": 106174601.0, + "step": 2914 + }, + { + "epoch": 0.5413184772516249, + "grad_norm": 1.5480546951293945, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8626123666763306, + "num_tokens": 106209964.0, + "step": 2915 + }, + { + "epoch": 0.5415041782729805, + "grad_norm": 1.3976593017578125, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8733127117156982, + "num_tokens": 106251116.0, + "step": 2916 + }, + { + "epoch": 0.5416898792943361, + "grad_norm": 1.3297415971755981, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.870103120803833, + "num_tokens": 106291839.0, + "step": 2917 + }, + { + "epoch": 0.5418755803156917, + "grad_norm": 1.6611217260360718, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.858329713344574, + "num_tokens": 106324112.0, + "step": 2918 + }, + { + "epoch": 0.5420612813370473, + "grad_norm": 1.5928013324737549, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8644636869430542, + "num_tokens": 106359312.0, + "step": 2919 + }, + { + "epoch": 0.542246982358403, + "grad_norm": 1.6355267763137817, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8547118902206421, + "num_tokens": 106396079.0, + "step": 2920 + }, + { + "epoch": 0.5424326833797586, + "grad_norm": 1.543412446975708, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8660951256752014, + "num_tokens": 106432078.0, + "step": 2921 + }, + { + "epoch": 0.5426183844011142, + "grad_norm": 1.4048322439193726, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8563535213470459, + "num_tokens": 106474345.0, + "step": 2922 + }, + { + "epoch": 0.5428040854224698, + "grad_norm": 1.5647428035736084, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8541587591171265, + "num_tokens": 106509107.0, + "step": 2923 + }, + { + "epoch": 0.5429897864438255, + "grad_norm": 1.6205936670303345, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8650688529014587, + "num_tokens": 106541383.0, + "step": 2924 + }, + { + "epoch": 0.5431754874651811, + "grad_norm": 1.571266531944275, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8739361763000488, + "num_tokens": 106579223.0, + "step": 2925 + }, + { + "epoch": 0.5433611884865367, + "grad_norm": 1.7448831796646118, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.852306067943573, + "num_tokens": 106613260.0, + "step": 2926 + }, + { + "epoch": 0.5435468895078923, + "grad_norm": 1.711601972579956, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8616779446601868, + "num_tokens": 106642969.0, + "step": 2927 + }, + { + "epoch": 0.5437325905292479, + "grad_norm": 1.5791354179382324, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8577788472175598, + "num_tokens": 106684817.0, + "step": 2928 + }, + { + "epoch": 0.5439182915506036, + "grad_norm": 1.5937652587890625, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8748511075973511, + "num_tokens": 106719094.0, + "step": 2929 + }, + { + "epoch": 0.5441039925719592, + "grad_norm": 1.6107113361358643, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8724899888038635, + "num_tokens": 106753102.0, + "step": 2930 + }, + { + "epoch": 0.5442896935933148, + "grad_norm": 1.4841738939285278, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8584010601043701, + "num_tokens": 106791747.0, + "step": 2931 + }, + { + "epoch": 0.5444753946146704, + "grad_norm": 1.5444843769073486, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8726680278778076, + "num_tokens": 106825473.0, + "step": 2932 + }, + { + "epoch": 0.544661095636026, + "grad_norm": 1.5376756191253662, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8740266561508179, + "num_tokens": 106858847.0, + "step": 2933 + }, + { + "epoch": 0.5448467966573816, + "grad_norm": 1.5775010585784912, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8727192878723145, + "num_tokens": 106892721.0, + "step": 2934 + }, + { + "epoch": 0.5450324976787372, + "grad_norm": 1.5080125331878662, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8552066683769226, + "num_tokens": 106928858.0, + "step": 2935 + }, + { + "epoch": 0.5452181987000928, + "grad_norm": 1.3973277807235718, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8683368563652039, + "num_tokens": 106969257.0, + "step": 2936 + }, + { + "epoch": 0.5454038997214484, + "grad_norm": 1.5707919597625732, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8569366931915283, + "num_tokens": 107002669.0, + "step": 2937 + }, + { + "epoch": 0.545589600742804, + "grad_norm": 1.49653160572052, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8641126155853271, + "num_tokens": 107041382.0, + "step": 2938 + }, + { + "epoch": 0.5457753017641597, + "grad_norm": 1.5005923509597778, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.86822909116745, + "num_tokens": 107079302.0, + "step": 2939 + }, + { + "epoch": 0.5459610027855153, + "grad_norm": 1.481977105140686, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8642546534538269, + "num_tokens": 107118134.0, + "step": 2940 + }, + { + "epoch": 0.5461467038068709, + "grad_norm": 1.4347455501556396, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8742492198944092, + "num_tokens": 107156602.0, + "step": 2941 + }, + { + "epoch": 0.5463324048282265, + "grad_norm": 1.5774116516113281, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8660250902175903, + "num_tokens": 107191421.0, + "step": 2942 + }, + { + "epoch": 0.5465181058495822, + "grad_norm": 1.5521941184997559, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8653683662414551, + "num_tokens": 107225959.0, + "step": 2943 + }, + { + "epoch": 0.5467038068709378, + "grad_norm": 1.5522342920303345, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8717246651649475, + "num_tokens": 107260073.0, + "step": 2944 + }, + { + "epoch": 0.5468895078922934, + "grad_norm": 1.6210147142410278, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.854822039604187, + "num_tokens": 107297185.0, + "step": 2945 + }, + { + "epoch": 0.547075208913649, + "grad_norm": 1.495621681213379, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.861075222492218, + "num_tokens": 107334879.0, + "step": 2946 + }, + { + "epoch": 0.5472609099350046, + "grad_norm": 1.5301704406738281, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.869431734085083, + "num_tokens": 107370244.0, + "step": 2947 + }, + { + "epoch": 0.5474466109563603, + "grad_norm": 1.676134467124939, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.875017523765564, + "num_tokens": 107399600.0, + "step": 2948 + }, + { + "epoch": 0.5476323119777159, + "grad_norm": 1.5095058679580688, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8711168169975281, + "num_tokens": 107435578.0, + "step": 2949 + }, + { + "epoch": 0.5478180129990715, + "grad_norm": 1.4690861701965332, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8763861656188965, + "num_tokens": 107474019.0, + "step": 2950 + }, + { + "epoch": 0.5480037140204271, + "grad_norm": 1.4446886777877808, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8812880516052246, + "num_tokens": 107512864.0, + "step": 2951 + }, + { + "epoch": 0.5481894150417828, + "grad_norm": 1.776757836341858, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8494101762771606, + "num_tokens": 107545173.0, + "step": 2952 + }, + { + "epoch": 0.5483751160631384, + "grad_norm": 1.6027339696884155, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8541586995124817, + "num_tokens": 107578878.0, + "step": 2953 + }, + { + "epoch": 0.548560817084494, + "grad_norm": 1.4712700843811035, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8798344135284424, + "num_tokens": 107614089.0, + "step": 2954 + }, + { + "epoch": 0.5487465181058496, + "grad_norm": 1.540719747543335, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8546996116638184, + "num_tokens": 107650849.0, + "step": 2955 + }, + { + "epoch": 0.5489322191272052, + "grad_norm": 1.4739373922348022, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8697251081466675, + "num_tokens": 107688643.0, + "step": 2956 + }, + { + "epoch": 0.5491179201485609, + "grad_norm": 1.6070085763931274, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8637440204620361, + "num_tokens": 107724957.0, + "step": 2957 + }, + { + "epoch": 0.5493036211699165, + "grad_norm": 1.6277823448181152, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8699357509613037, + "num_tokens": 107759471.0, + "step": 2958 + }, + { + "epoch": 0.549489322191272, + "grad_norm": 1.5328553915023804, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8646177649497986, + "num_tokens": 107794710.0, + "step": 2959 + }, + { + "epoch": 0.5496750232126276, + "grad_norm": 1.441807746887207, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8626049757003784, + "num_tokens": 107832365.0, + "step": 2960 + }, + { + "epoch": 0.5498607242339832, + "grad_norm": 1.7074007987976074, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8479942083358765, + "num_tokens": 107866137.0, + "step": 2961 + }, + { + "epoch": 0.5500464252553389, + "grad_norm": 1.4619899988174438, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8577780723571777, + "num_tokens": 107907653.0, + "step": 2962 + }, + { + "epoch": 0.5502321262766945, + "grad_norm": 1.4617919921875, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8680464625358582, + "num_tokens": 107947646.0, + "step": 2963 + }, + { + "epoch": 0.5504178272980501, + "grad_norm": 1.7462977170944214, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8520942330360413, + "num_tokens": 107985864.0, + "step": 2964 + }, + { + "epoch": 0.5506035283194057, + "grad_norm": 1.5136407613754272, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8660179376602173, + "num_tokens": 108022529.0, + "step": 2965 + }, + { + "epoch": 0.5507892293407614, + "grad_norm": 1.528204083442688, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8715853691101074, + "num_tokens": 108062076.0, + "step": 2966 + }, + { + "epoch": 0.550974930362117, + "grad_norm": 1.7664204835891724, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.862383246421814, + "num_tokens": 108092045.0, + "step": 2967 + }, + { + "epoch": 0.5511606313834726, + "grad_norm": 1.560204029083252, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8657220005989075, + "num_tokens": 108126587.0, + "step": 2968 + }, + { + "epoch": 0.5513463324048282, + "grad_norm": 1.4459354877471924, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8727400898933411, + "num_tokens": 108164277.0, + "step": 2969 + }, + { + "epoch": 0.5515320334261838, + "grad_norm": 1.572561502456665, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8603198528289795, + "num_tokens": 108203526.0, + "step": 2970 + }, + { + "epoch": 0.5517177344475395, + "grad_norm": 1.492164134979248, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8751587867736816, + "num_tokens": 108241529.0, + "step": 2971 + }, + { + "epoch": 0.5519034354688951, + "grad_norm": 1.470663070678711, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.864660382270813, + "num_tokens": 108281550.0, + "step": 2972 + }, + { + "epoch": 0.5520891364902507, + "grad_norm": 1.6286687850952148, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8606488108634949, + "num_tokens": 108315990.0, + "step": 2973 + }, + { + "epoch": 0.5522748375116063, + "grad_norm": 1.4409456253051758, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.875247597694397, + "num_tokens": 108354024.0, + "step": 2974 + }, + { + "epoch": 0.552460538532962, + "grad_norm": 1.5008071660995483, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8696630001068115, + "num_tokens": 108393463.0, + "step": 2975 + }, + { + "epoch": 0.5526462395543176, + "grad_norm": 1.5867373943328857, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8531962037086487, + "num_tokens": 108430950.0, + "step": 2976 + }, + { + "epoch": 0.5528319405756732, + "grad_norm": 1.6312763690948486, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8654693365097046, + "num_tokens": 108465065.0, + "step": 2977 + }, + { + "epoch": 0.5530176415970288, + "grad_norm": 1.6778451204299927, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8617575168609619, + "num_tokens": 108498189.0, + "step": 2978 + }, + { + "epoch": 0.5532033426183844, + "grad_norm": 1.5739200115203857, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8688313961029053, + "num_tokens": 108534715.0, + "step": 2979 + }, + { + "epoch": 0.5533890436397401, + "grad_norm": 1.517701268196106, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8751313090324402, + "num_tokens": 108572059.0, + "step": 2980 + }, + { + "epoch": 0.5535747446610957, + "grad_norm": 1.4814265966415405, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8607461452484131, + "num_tokens": 108611612.0, + "step": 2981 + }, + { + "epoch": 0.5537604456824513, + "grad_norm": 1.4924941062927246, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8639264106750488, + "num_tokens": 108648295.0, + "step": 2982 + }, + { + "epoch": 0.5539461467038068, + "grad_norm": 1.469472050666809, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8837808966636658, + "num_tokens": 108683389.0, + "step": 2983 + }, + { + "epoch": 0.5541318477251624, + "grad_norm": 1.5032124519348145, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8538260459899902, + "num_tokens": 108723743.0, + "step": 2984 + }, + { + "epoch": 0.5543175487465181, + "grad_norm": 1.6923950910568237, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.867501974105835, + "num_tokens": 108753604.0, + "step": 2985 + }, + { + "epoch": 0.5545032497678737, + "grad_norm": 1.511138677597046, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8602910041809082, + "num_tokens": 108792329.0, + "step": 2986 + }, + { + "epoch": 0.5546889507892293, + "grad_norm": 1.750275731086731, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.854928195476532, + "num_tokens": 108824352.0, + "step": 2987 + }, + { + "epoch": 0.5548746518105849, + "grad_norm": 1.5189766883850098, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8802462816238403, + "num_tokens": 108858595.0, + "step": 2988 + }, + { + "epoch": 0.5550603528319406, + "grad_norm": 1.5051859617233276, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8727601766586304, + "num_tokens": 108896435.0, + "step": 2989 + }, + { + "epoch": 0.5552460538532962, + "grad_norm": 1.6048215627670288, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8454056978225708, + "num_tokens": 108932859.0, + "step": 2990 + }, + { + "epoch": 0.5554317548746518, + "grad_norm": 1.5719971656799316, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8667168021202087, + "num_tokens": 108968782.0, + "step": 2991 + }, + { + "epoch": 0.5556174558960074, + "grad_norm": 1.6143054962158203, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8671187162399292, + "num_tokens": 109002943.0, + "step": 2992 + }, + { + "epoch": 0.555803156917363, + "grad_norm": 1.5146788358688354, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.870141863822937, + "num_tokens": 109040873.0, + "step": 2993 + }, + { + "epoch": 0.5559888579387187, + "grad_norm": 1.5257794857025146, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8640537858009338, + "num_tokens": 109081047.0, + "step": 2994 + }, + { + "epoch": 0.5561745589600743, + "grad_norm": 1.4475338459014893, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8741889595985413, + "num_tokens": 109121742.0, + "step": 2995 + }, + { + "epoch": 0.5563602599814299, + "grad_norm": 1.5529371500015259, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8732814192771912, + "num_tokens": 109157124.0, + "step": 2996 + }, + { + "epoch": 0.5565459610027855, + "grad_norm": 1.5241070985794067, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8536012172698975, + "num_tokens": 109196890.0, + "step": 2997 + }, + { + "epoch": 0.5567316620241411, + "grad_norm": 1.4308046102523804, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8779659271240234, + "num_tokens": 109236017.0, + "step": 2998 + }, + { + "epoch": 0.5569173630454968, + "grad_norm": 1.5577272176742554, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8856710195541382, + "num_tokens": 109265922.0, + "step": 2999 + }, + { + "epoch": 0.5571030640668524, + "grad_norm": 1.6224886178970337, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8482134938240051, + "num_tokens": 109300937.0, + "step": 3000 + }, + { + "epoch": 0.557288765088208, + "grad_norm": 1.5424028635025024, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8780581951141357, + "num_tokens": 109336623.0, + "step": 3001 + }, + { + "epoch": 0.5574744661095636, + "grad_norm": 1.5139323472976685, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8698512315750122, + "num_tokens": 109370790.0, + "step": 3002 + }, + { + "epoch": 0.5576601671309193, + "grad_norm": 1.375964879989624, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8647566437721252, + "num_tokens": 109413297.0, + "step": 3003 + }, + { + "epoch": 0.5578458681522749, + "grad_norm": 1.6127785444259644, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8679944276809692, + "num_tokens": 109447296.0, + "step": 3004 + }, + { + "epoch": 0.5580315691736305, + "grad_norm": 1.491615891456604, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8609997034072876, + "num_tokens": 109488131.0, + "step": 3005 + }, + { + "epoch": 0.5582172701949861, + "grad_norm": 1.569811224937439, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8536180257797241, + "num_tokens": 109524481.0, + "step": 3006 + }, + { + "epoch": 0.5584029712163416, + "grad_norm": 1.8119282722473145, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.863777756690979, + "num_tokens": 109553941.0, + "step": 3007 + }, + { + "epoch": 0.5585886722376973, + "grad_norm": 1.5609632730484009, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.865986704826355, + "num_tokens": 109591002.0, + "step": 3008 + }, + { + "epoch": 0.5587743732590529, + "grad_norm": 1.628398060798645, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8614816665649414, + "num_tokens": 109626045.0, + "step": 3009 + }, + { + "epoch": 0.5589600742804085, + "grad_norm": 1.6221652030944824, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.848737359046936, + "num_tokens": 109663826.0, + "step": 3010 + }, + { + "epoch": 0.5591457753017641, + "grad_norm": 1.5423401594161987, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8666011691093445, + "num_tokens": 109698329.0, + "step": 3011 + }, + { + "epoch": 0.5593314763231197, + "grad_norm": 1.6061428785324097, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8604741096496582, + "num_tokens": 109732435.0, + "step": 3012 + }, + { + "epoch": 0.5595171773444754, + "grad_norm": 1.6520252227783203, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8616888523101807, + "num_tokens": 109768563.0, + "step": 3013 + }, + { + "epoch": 0.559702878365831, + "grad_norm": 1.5506377220153809, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8784862756729126, + "num_tokens": 109803976.0, + "step": 3014 + }, + { + "epoch": 0.5598885793871866, + "grad_norm": 1.5002104043960571, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8609732389450073, + "num_tokens": 109841503.0, + "step": 3015 + }, + { + "epoch": 0.5600742804085422, + "grad_norm": 1.5959725379943848, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8741214275360107, + "num_tokens": 109875392.0, + "step": 3016 + }, + { + "epoch": 0.5602599814298979, + "grad_norm": 1.4892849922180176, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8731909990310669, + "num_tokens": 109911919.0, + "step": 3017 + }, + { + "epoch": 0.5604456824512535, + "grad_norm": 1.575556755065918, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8637661337852478, + "num_tokens": 109947985.0, + "step": 3018 + }, + { + "epoch": 0.5606313834726091, + "grad_norm": 1.5682802200317383, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8761608600616455, + "num_tokens": 109981331.0, + "step": 3019 + }, + { + "epoch": 0.5608170844939647, + "grad_norm": 1.5037431716918945, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8787715435028076, + "num_tokens": 110016209.0, + "step": 3020 + }, + { + "epoch": 0.5610027855153203, + "grad_norm": 1.485684871673584, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8680707216262817, + "num_tokens": 110054083.0, + "step": 3021 + }, + { + "epoch": 0.561188486536676, + "grad_norm": 1.5797744989395142, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8680829405784607, + "num_tokens": 110092067.0, + "step": 3022 + }, + { + "epoch": 0.5613741875580316, + "grad_norm": 1.7614996433258057, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8569867014884949, + "num_tokens": 110125071.0, + "step": 3023 + }, + { + "epoch": 0.5615598885793872, + "grad_norm": 1.6500827074050903, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8665968179702759, + "num_tokens": 110157655.0, + "step": 3024 + }, + { + "epoch": 0.5617455896007428, + "grad_norm": 1.634912371635437, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8715775609016418, + "num_tokens": 110188779.0, + "step": 3025 + }, + { + "epoch": 0.5619312906220985, + "grad_norm": 1.4563323259353638, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.862338125705719, + "num_tokens": 110230589.0, + "step": 3026 + }, + { + "epoch": 0.5621169916434541, + "grad_norm": 1.5636329650878906, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8661342859268188, + "num_tokens": 110267448.0, + "step": 3027 + }, + { + "epoch": 0.5623026926648097, + "grad_norm": 1.5414738655090332, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8774381279945374, + "num_tokens": 110304191.0, + "step": 3028 + }, + { + "epoch": 0.5624883936861653, + "grad_norm": 1.461563229560852, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8626044392585754, + "num_tokens": 110344622.0, + "step": 3029 + }, + { + "epoch": 0.5626740947075209, + "grad_norm": 1.5670915842056274, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8681955933570862, + "num_tokens": 110382501.0, + "step": 3030 + }, + { + "epoch": 0.5628597957288765, + "grad_norm": 1.5326619148254395, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8658303022384644, + "num_tokens": 110421283.0, + "step": 3031 + }, + { + "epoch": 0.5630454967502321, + "grad_norm": 1.4926055669784546, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8592041730880737, + "num_tokens": 110459286.0, + "step": 3032 + }, + { + "epoch": 0.5632311977715877, + "grad_norm": 1.4347692728042603, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8676581382751465, + "num_tokens": 110497561.0, + "step": 3033 + }, + { + "epoch": 0.5634168987929433, + "grad_norm": 1.5002751350402832, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8654970526695251, + "num_tokens": 110533305.0, + "step": 3034 + }, + { + "epoch": 0.5636025998142989, + "grad_norm": 1.6221908330917358, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8629559874534607, + "num_tokens": 110566865.0, + "step": 3035 + }, + { + "epoch": 0.5637883008356546, + "grad_norm": 1.5728731155395508, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8540099859237671, + "num_tokens": 110601977.0, + "step": 3036 + }, + { + "epoch": 0.5639740018570102, + "grad_norm": 1.637015700340271, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8625157475471497, + "num_tokens": 110636822.0, + "step": 3037 + }, + { + "epoch": 0.5641597028783658, + "grad_norm": 1.466058611869812, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8832035064697266, + "num_tokens": 110673279.0, + "step": 3038 + }, + { + "epoch": 0.5643454038997214, + "grad_norm": 1.4150187969207764, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8851606845855713, + "num_tokens": 110711491.0, + "step": 3039 + }, + { + "epoch": 0.564531104921077, + "grad_norm": 1.4840428829193115, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8584755659103394, + "num_tokens": 110748756.0, + "step": 3040 + }, + { + "epoch": 0.5647168059424327, + "grad_norm": 1.4173521995544434, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8729579448699951, + "num_tokens": 110786681.0, + "step": 3041 + }, + { + "epoch": 0.5649025069637883, + "grad_norm": 1.476662039756775, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8844312429428101, + "num_tokens": 110818943.0, + "step": 3042 + }, + { + "epoch": 0.5650882079851439, + "grad_norm": 1.3062459230422974, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8598571419715881, + "num_tokens": 110865769.0, + "step": 3043 + }, + { + "epoch": 0.5652739090064995, + "grad_norm": 1.3927760124206543, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8699716329574585, + "num_tokens": 110904548.0, + "step": 3044 + }, + { + "epoch": 0.5654596100278552, + "grad_norm": 1.6547526121139526, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8438550233840942, + "num_tokens": 110941172.0, + "step": 3045 + }, + { + "epoch": 0.5656453110492108, + "grad_norm": 1.4491742849349976, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8657348155975342, + "num_tokens": 110981502.0, + "step": 3046 + }, + { + "epoch": 0.5658310120705664, + "grad_norm": 1.4619320631027222, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8647438883781433, + "num_tokens": 111020142.0, + "step": 3047 + }, + { + "epoch": 0.566016713091922, + "grad_norm": 1.6809024810791016, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8732404708862305, + "num_tokens": 111054502.0, + "step": 3048 + }, + { + "epoch": 0.5662024141132777, + "grad_norm": 1.3928322792053223, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8751335144042969, + "num_tokens": 111096174.0, + "step": 3049 + }, + { + "epoch": 0.5663881151346333, + "grad_norm": 1.5553886890411377, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8657916188240051, + "num_tokens": 111133684.0, + "step": 3050 + }, + { + "epoch": 0.5665738161559889, + "grad_norm": 1.6665459871292114, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8754717111587524, + "num_tokens": 111163680.0, + "step": 3051 + }, + { + "epoch": 0.5667595171773445, + "grad_norm": 1.588852047920227, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8615481853485107, + "num_tokens": 111197581.0, + "step": 3052 + }, + { + "epoch": 0.5669452181987001, + "grad_norm": 1.7061684131622314, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8398339748382568, + "num_tokens": 111234351.0, + "step": 3053 + }, + { + "epoch": 0.5671309192200558, + "grad_norm": 1.6547480821609497, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8528389930725098, + "num_tokens": 111269328.0, + "step": 3054 + }, + { + "epoch": 0.5673166202414113, + "grad_norm": 1.6245408058166504, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8370592594146729, + "num_tokens": 111304220.0, + "step": 3055 + }, + { + "epoch": 0.5675023212627669, + "grad_norm": 1.5967929363250732, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8668632507324219, + "num_tokens": 111338089.0, + "step": 3056 + }, + { + "epoch": 0.5676880222841225, + "grad_norm": 1.5820447206497192, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8564146757125854, + "num_tokens": 111370595.0, + "step": 3057 + }, + { + "epoch": 0.5678737233054781, + "grad_norm": 1.5440316200256348, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8433587551116943, + "num_tokens": 111409884.0, + "step": 3058 + }, + { + "epoch": 0.5680594243268338, + "grad_norm": 1.7296984195709229, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8444536924362183, + "num_tokens": 111441864.0, + "step": 3059 + }, + { + "epoch": 0.5682451253481894, + "grad_norm": 1.411160945892334, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.870857834815979, + "num_tokens": 111479141.0, + "step": 3060 + }, + { + "epoch": 0.568430826369545, + "grad_norm": 1.623034119606018, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8562412261962891, + "num_tokens": 111516125.0, + "step": 3061 + }, + { + "epoch": 0.5686165273909006, + "grad_norm": 1.6593257188796997, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8635355830192566, + "num_tokens": 111548593.0, + "step": 3062 + }, + { + "epoch": 0.5688022284122562, + "grad_norm": 1.449339509010315, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8757093548774719, + "num_tokens": 111587639.0, + "step": 3063 + }, + { + "epoch": 0.5689879294336119, + "grad_norm": 1.4412819147109985, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8705414533615112, + "num_tokens": 111626445.0, + "step": 3064 + }, + { + "epoch": 0.5691736304549675, + "grad_norm": 1.4957311153411865, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8715316653251648, + "num_tokens": 111665724.0, + "step": 3065 + }, + { + "epoch": 0.5693593314763231, + "grad_norm": 1.3901344537734985, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8567643761634827, + "num_tokens": 111712632.0, + "step": 3066 + }, + { + "epoch": 0.5695450324976787, + "grad_norm": 1.442391037940979, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.834912896156311, + "num_tokens": 111760738.0, + "step": 3067 + }, + { + "epoch": 0.5697307335190344, + "grad_norm": 1.8865160942077637, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8523203134536743, + "num_tokens": 111793431.0, + "step": 3068 + }, + { + "epoch": 0.56991643454039, + "grad_norm": 1.5402216911315918, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8657926321029663, + "num_tokens": 111827527.0, + "step": 3069 + }, + { + "epoch": 0.5701021355617456, + "grad_norm": 1.505563497543335, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8532181978225708, + "num_tokens": 111870537.0, + "step": 3070 + }, + { + "epoch": 0.5702878365831012, + "grad_norm": 1.4802137613296509, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8732732534408569, + "num_tokens": 111906755.0, + "step": 3071 + }, + { + "epoch": 0.5704735376044568, + "grad_norm": 1.3752849102020264, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8684437274932861, + "num_tokens": 111950433.0, + "step": 3072 + }, + { + "epoch": 0.5706592386258125, + "grad_norm": 1.5111291408538818, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8533034324645996, + "num_tokens": 111989654.0, + "step": 3073 + }, + { + "epoch": 0.5708449396471681, + "grad_norm": 1.6073369979858398, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.85419762134552, + "num_tokens": 112026117.0, + "step": 3074 + }, + { + "epoch": 0.5710306406685237, + "grad_norm": 1.4417237043380737, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8640065789222717, + "num_tokens": 112062855.0, + "step": 3075 + }, + { + "epoch": 0.5712163416898793, + "grad_norm": 1.4305471181869507, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8716026544570923, + "num_tokens": 112103473.0, + "step": 3076 + }, + { + "epoch": 0.571402042711235, + "grad_norm": 1.5454356670379639, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8643199801445007, + "num_tokens": 112140136.0, + "step": 3077 + }, + { + "epoch": 0.5715877437325906, + "grad_norm": 1.5136390924453735, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8623778223991394, + "num_tokens": 112178958.0, + "step": 3078 + }, + { + "epoch": 0.5717734447539461, + "grad_norm": 1.4671339988708496, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8512489199638367, + "num_tokens": 112218436.0, + "step": 3079 + }, + { + "epoch": 0.5719591457753017, + "grad_norm": 1.4769779443740845, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8493158221244812, + "num_tokens": 112258086.0, + "step": 3080 + }, + { + "epoch": 0.5721448467966573, + "grad_norm": 1.4773776531219482, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8815796375274658, + "num_tokens": 112293575.0, + "step": 3081 + }, + { + "epoch": 0.572330547818013, + "grad_norm": 1.64491868019104, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8694379925727844, + "num_tokens": 112323693.0, + "step": 3082 + }, + { + "epoch": 0.5725162488393686, + "grad_norm": 1.394768238067627, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.875139057636261, + "num_tokens": 112364771.0, + "step": 3083 + }, + { + "epoch": 0.5727019498607242, + "grad_norm": 1.4670897722244263, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8704628944396973, + "num_tokens": 112403978.0, + "step": 3084 + }, + { + "epoch": 0.5728876508820798, + "grad_norm": 1.6701915264129639, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8549588322639465, + "num_tokens": 112435904.0, + "step": 3085 + }, + { + "epoch": 0.5730733519034354, + "grad_norm": 1.5972992181777954, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8691439628601074, + "num_tokens": 112470850.0, + "step": 3086 + }, + { + "epoch": 0.5732590529247911, + "grad_norm": 1.4550673961639404, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8841143846511841, + "num_tokens": 112509980.0, + "step": 3087 + }, + { + "epoch": 0.5734447539461467, + "grad_norm": 1.5610008239746094, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.862944483757019, + "num_tokens": 112546176.0, + "step": 3088 + }, + { + "epoch": 0.5736304549675023, + "grad_norm": 1.476773977279663, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8642895221710205, + "num_tokens": 112585661.0, + "step": 3089 + }, + { + "epoch": 0.5738161559888579, + "grad_norm": 1.4999974966049194, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8707984089851379, + "num_tokens": 112620424.0, + "step": 3090 + }, + { + "epoch": 0.5740018570102136, + "grad_norm": 1.596173882484436, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.842617928981781, + "num_tokens": 112654788.0, + "step": 3091 + }, + { + "epoch": 0.5741875580315692, + "grad_norm": 1.5889606475830078, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8606094121932983, + "num_tokens": 112691027.0, + "step": 3092 + }, + { + "epoch": 0.5743732590529248, + "grad_norm": 1.611738920211792, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8471524715423584, + "num_tokens": 112728707.0, + "step": 3093 + }, + { + "epoch": 0.5745589600742804, + "grad_norm": 1.5624423027038574, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8759149312973022, + "num_tokens": 112760541.0, + "step": 3094 + }, + { + "epoch": 0.574744661095636, + "grad_norm": 1.556527853012085, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8662644028663635, + "num_tokens": 112795263.0, + "step": 3095 + }, + { + "epoch": 0.5749303621169917, + "grad_norm": 1.5758978128433228, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8710756301879883, + "num_tokens": 112829070.0, + "step": 3096 + }, + { + "epoch": 0.5751160631383473, + "grad_norm": 1.576817274093628, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8660393953323364, + "num_tokens": 112865136.0, + "step": 3097 + }, + { + "epoch": 0.5753017641597029, + "grad_norm": 1.5081313848495483, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8773553371429443, + "num_tokens": 112900395.0, + "step": 3098 + }, + { + "epoch": 0.5754874651810585, + "grad_norm": 1.5492489337921143, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8818564414978027, + "num_tokens": 112929503.0, + "step": 3099 + }, + { + "epoch": 0.5756731662024142, + "grad_norm": 1.5983853340148926, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8572746515274048, + "num_tokens": 112966233.0, + "step": 3100 + }, + { + "epoch": 0.5758588672237698, + "grad_norm": 1.537380337715149, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.862496554851532, + "num_tokens": 113003514.0, + "step": 3101 + }, + { + "epoch": 0.5760445682451254, + "grad_norm": 1.56275475025177, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8650195002555847, + "num_tokens": 113037330.0, + "step": 3102 + }, + { + "epoch": 0.576230269266481, + "grad_norm": 1.5221893787384033, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8560836315155029, + "num_tokens": 113073295.0, + "step": 3103 + }, + { + "epoch": 0.5764159702878365, + "grad_norm": 1.5093414783477783, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8494150638580322, + "num_tokens": 113111900.0, + "step": 3104 + }, + { + "epoch": 0.5766016713091922, + "grad_norm": 1.3715801239013672, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8675209879875183, + "num_tokens": 113153087.0, + "step": 3105 + }, + { + "epoch": 0.5767873723305478, + "grad_norm": 1.4542723894119263, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8790403008460999, + "num_tokens": 113192385.0, + "step": 3106 + }, + { + "epoch": 0.5769730733519034, + "grad_norm": 1.4408632516860962, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8779699206352234, + "num_tokens": 113229933.0, + "step": 3107 + }, + { + "epoch": 0.577158774373259, + "grad_norm": 1.480176329612732, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8522195219993591, + "num_tokens": 113270613.0, + "step": 3108 + }, + { + "epoch": 0.5773444753946146, + "grad_norm": 1.5600422620773315, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8622937202453613, + "num_tokens": 113307864.0, + "step": 3109 + }, + { + "epoch": 0.5775301764159703, + "grad_norm": 1.6693789958953857, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8709982633590698, + "num_tokens": 113340613.0, + "step": 3110 + }, + { + "epoch": 0.5777158774373259, + "grad_norm": 1.6288481950759888, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8624668717384338, + "num_tokens": 113375926.0, + "step": 3111 + }, + { + "epoch": 0.5779015784586815, + "grad_norm": 1.3863208293914795, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8615272045135498, + "num_tokens": 113418412.0, + "step": 3112 + }, + { + "epoch": 0.5780872794800371, + "grad_norm": 1.6797178983688354, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8722307682037354, + "num_tokens": 113448519.0, + "step": 3113 + }, + { + "epoch": 0.5782729805013928, + "grad_norm": 1.629267930984497, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8494104146957397, + "num_tokens": 113484470.0, + "step": 3114 + }, + { + "epoch": 0.5784586815227484, + "grad_norm": 1.7444688081741333, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8445626497268677, + "num_tokens": 113517542.0, + "step": 3115 + }, + { + "epoch": 0.578644382544104, + "grad_norm": 1.6481951475143433, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8686206340789795, + "num_tokens": 113548085.0, + "step": 3116 + }, + { + "epoch": 0.5788300835654596, + "grad_norm": 1.4664268493652344, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8793026208877563, + "num_tokens": 113586557.0, + "step": 3117 + }, + { + "epoch": 0.5790157845868152, + "grad_norm": 1.5144802331924438, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8836954236030579, + "num_tokens": 113619798.0, + "step": 3118 + }, + { + "epoch": 0.5792014856081709, + "grad_norm": 1.4293317794799805, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8660649061203003, + "num_tokens": 113661718.0, + "step": 3119 + }, + { + "epoch": 0.5793871866295265, + "grad_norm": 1.6094443798065186, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8603024482727051, + "num_tokens": 113690740.0, + "step": 3120 + }, + { + "epoch": 0.5795728876508821, + "grad_norm": 1.513089656829834, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8608843088150024, + "num_tokens": 113730913.0, + "step": 3121 + }, + { + "epoch": 0.5797585886722377, + "grad_norm": 1.6154282093048096, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.869695246219635, + "num_tokens": 113765119.0, + "step": 3122 + }, + { + "epoch": 0.5799442896935934, + "grad_norm": 1.5526747703552246, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8833056688308716, + "num_tokens": 113796686.0, + "step": 3123 + }, + { + "epoch": 0.580129990714949, + "grad_norm": 1.4574686288833618, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8647027015686035, + "num_tokens": 113834113.0, + "step": 3124 + }, + { + "epoch": 0.5803156917363046, + "grad_norm": 1.4486629962921143, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8641617894172668, + "num_tokens": 113874634.0, + "step": 3125 + }, + { + "epoch": 0.5805013927576602, + "grad_norm": 1.642516016960144, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8484264016151428, + "num_tokens": 113909801.0, + "step": 3126 + }, + { + "epoch": 0.5806870937790158, + "grad_norm": 1.4963897466659546, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8658542633056641, + "num_tokens": 113945349.0, + "step": 3127 + }, + { + "epoch": 0.5808727948003714, + "grad_norm": 1.5330867767333984, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8662726879119873, + "num_tokens": 113980696.0, + "step": 3128 + }, + { + "epoch": 0.581058495821727, + "grad_norm": 1.4260551929473877, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8514121174812317, + "num_tokens": 114024002.0, + "step": 3129 + }, + { + "epoch": 0.5812441968430826, + "grad_norm": 1.4661864042282104, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8731702566146851, + "num_tokens": 114059973.0, + "step": 3130 + }, + { + "epoch": 0.5814298978644382, + "grad_norm": 1.8598238229751587, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8451102375984192, + "num_tokens": 114088142.0, + "step": 3131 + }, + { + "epoch": 0.5816155988857938, + "grad_norm": 1.5313109159469604, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8602656126022339, + "num_tokens": 114124111.0, + "step": 3132 + }, + { + "epoch": 0.5818012999071495, + "grad_norm": 1.4109563827514648, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8773896098136902, + "num_tokens": 114166815.0, + "step": 3133 + }, + { + "epoch": 0.5819870009285051, + "grad_norm": 1.6417372226715088, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8598501682281494, + "num_tokens": 114200669.0, + "step": 3134 + }, + { + "epoch": 0.5821727019498607, + "grad_norm": 1.5101383924484253, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8672384023666382, + "num_tokens": 114235808.0, + "step": 3135 + }, + { + "epoch": 0.5823584029712163, + "grad_norm": 1.5022625923156738, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.864475429058075, + "num_tokens": 114272888.0, + "step": 3136 + }, + { + "epoch": 0.582544103992572, + "grad_norm": 1.6682077646255493, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8690232634544373, + "num_tokens": 114305636.0, + "step": 3137 + }, + { + "epoch": 0.5827298050139276, + "grad_norm": 1.7194777727127075, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8631894588470459, + "num_tokens": 114337392.0, + "step": 3138 + }, + { + "epoch": 0.5829155060352832, + "grad_norm": 1.480392336845398, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8581727147102356, + "num_tokens": 114376663.0, + "step": 3139 + }, + { + "epoch": 0.5831012070566388, + "grad_norm": 1.5503261089324951, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8661705255508423, + "num_tokens": 114412573.0, + "step": 3140 + }, + { + "epoch": 0.5832869080779944, + "grad_norm": 1.3666990995407104, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8757956027984619, + "num_tokens": 114457565.0, + "step": 3141 + }, + { + "epoch": 0.5834726090993501, + "grad_norm": 1.4643363952636719, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8775684237480164, + "num_tokens": 114496716.0, + "step": 3142 + }, + { + "epoch": 0.5836583101207057, + "grad_norm": 1.5522983074188232, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8646644949913025, + "num_tokens": 114535870.0, + "step": 3143 + }, + { + "epoch": 0.5838440111420613, + "grad_norm": 1.6008901596069336, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8751091957092285, + "num_tokens": 114568315.0, + "step": 3144 + }, + { + "epoch": 0.5840297121634169, + "grad_norm": 1.6159394979476929, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8581609725952148, + "num_tokens": 114602677.0, + "step": 3145 + }, + { + "epoch": 0.5842154131847725, + "grad_norm": 1.470511794090271, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8675362467765808, + "num_tokens": 114645425.0, + "step": 3146 + }, + { + "epoch": 0.5844011142061282, + "grad_norm": 1.8147103786468506, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8634749054908752, + "num_tokens": 114673887.0, + "step": 3147 + }, + { + "epoch": 0.5845868152274838, + "grad_norm": 1.609886646270752, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8572905659675598, + "num_tokens": 114708007.0, + "step": 3148 + }, + { + "epoch": 0.5847725162488394, + "grad_norm": 1.4910808801651, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8603120446205139, + "num_tokens": 114747011.0, + "step": 3149 + }, + { + "epoch": 0.584958217270195, + "grad_norm": 1.4536864757537842, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8715813755989075, + "num_tokens": 114784603.0, + "step": 3150 + }, + { + "epoch": 0.5851439182915507, + "grad_norm": 1.6359117031097412, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8562549352645874, + "num_tokens": 114821267.0, + "step": 3151 + }, + { + "epoch": 0.5853296193129062, + "grad_norm": 1.4399155378341675, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.879183828830719, + "num_tokens": 114858467.0, + "step": 3152 + }, + { + "epoch": 0.5855153203342618, + "grad_norm": 1.570299744606018, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.86617511510849, + "num_tokens": 114895910.0, + "step": 3153 + }, + { + "epoch": 0.5857010213556174, + "grad_norm": 1.459328532218933, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8609251379966736, + "num_tokens": 114937641.0, + "step": 3154 + }, + { + "epoch": 0.585886722376973, + "grad_norm": 1.5133031606674194, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8522472381591797, + "num_tokens": 114974810.0, + "step": 3155 + }, + { + "epoch": 0.5860724233983287, + "grad_norm": 1.697269320487976, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8590322136878967, + "num_tokens": 115008547.0, + "step": 3156 + }, + { + "epoch": 0.5862581244196843, + "grad_norm": 1.5193572044372559, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.872455894947052, + "num_tokens": 115041940.0, + "step": 3157 + }, + { + "epoch": 0.5864438254410399, + "grad_norm": 1.5972423553466797, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8764657974243164, + "num_tokens": 115072429.0, + "step": 3158 + }, + { + "epoch": 0.5866295264623955, + "grad_norm": 1.5983147621154785, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8571046590805054, + "num_tokens": 115107391.0, + "step": 3159 + }, + { + "epoch": 0.5868152274837511, + "grad_norm": 1.4321706295013428, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8768808841705322, + "num_tokens": 115150872.0, + "step": 3160 + }, + { + "epoch": 0.5870009285051068, + "grad_norm": 1.656203269958496, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8485208749771118, + "num_tokens": 115186636.0, + "step": 3161 + }, + { + "epoch": 0.5871866295264624, + "grad_norm": 1.4547150135040283, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8666632175445557, + "num_tokens": 115228685.0, + "step": 3162 + }, + { + "epoch": 0.587372330547818, + "grad_norm": 1.6825989484786987, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8780875205993652, + "num_tokens": 115254925.0, + "step": 3163 + }, + { + "epoch": 0.5875580315691736, + "grad_norm": 1.6327176094055176, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.870849072933197, + "num_tokens": 115292834.0, + "step": 3164 + }, + { + "epoch": 0.5877437325905293, + "grad_norm": 1.6280086040496826, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8656141757965088, + "num_tokens": 115332158.0, + "step": 3165 + }, + { + "epoch": 0.5879294336118849, + "grad_norm": 1.5626919269561768, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8653466701507568, + "num_tokens": 115365720.0, + "step": 3166 + }, + { + "epoch": 0.5881151346332405, + "grad_norm": 1.4989557266235352, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8697842359542847, + "num_tokens": 115403900.0, + "step": 3167 + }, + { + "epoch": 0.5883008356545961, + "grad_norm": 1.6413486003875732, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8552297353744507, + "num_tokens": 115436650.0, + "step": 3168 + }, + { + "epoch": 0.5884865366759517, + "grad_norm": 1.4929410219192505, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8697876334190369, + "num_tokens": 115476502.0, + "step": 3169 + }, + { + "epoch": 0.5886722376973074, + "grad_norm": 1.5354374647140503, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8707461357116699, + "num_tokens": 115511715.0, + "step": 3170 + }, + { + "epoch": 0.588857938718663, + "grad_norm": 1.5322264432907104, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8776161670684814, + "num_tokens": 115547102.0, + "step": 3171 + }, + { + "epoch": 0.5890436397400186, + "grad_norm": 1.6246954202651978, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8392291069030762, + "num_tokens": 115586647.0, + "step": 3172 + }, + { + "epoch": 0.5892293407613742, + "grad_norm": 1.4229907989501953, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8756425380706787, + "num_tokens": 115624606.0, + "step": 3173 + }, + { + "epoch": 0.5894150417827299, + "grad_norm": 1.482404351234436, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8670178651809692, + "num_tokens": 115660430.0, + "step": 3174 + }, + { + "epoch": 0.5896007428040855, + "grad_norm": 1.5087437629699707, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8658515214920044, + "num_tokens": 115696956.0, + "step": 3175 + }, + { + "epoch": 0.589786443825441, + "grad_norm": 1.461512804031372, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8759588003158569, + "num_tokens": 115735176.0, + "step": 3176 + }, + { + "epoch": 0.5899721448467966, + "grad_norm": 1.5422383546829224, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8670673966407776, + "num_tokens": 115769197.0, + "step": 3177 + }, + { + "epoch": 0.5901578458681522, + "grad_norm": 1.693083643913269, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8668310642242432, + "num_tokens": 115801809.0, + "step": 3178 + }, + { + "epoch": 0.5903435468895079, + "grad_norm": 1.7967262268066406, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8692383766174316, + "num_tokens": 115833287.0, + "step": 3179 + }, + { + "epoch": 0.5905292479108635, + "grad_norm": 1.6054236888885498, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8605821132659912, + "num_tokens": 115869274.0, + "step": 3180 + }, + { + "epoch": 0.5907149489322191, + "grad_norm": 1.5160554647445679, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.860363245010376, + "num_tokens": 115904037.0, + "step": 3181 + }, + { + "epoch": 0.5909006499535747, + "grad_norm": 1.618047833442688, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.867544412612915, + "num_tokens": 115936110.0, + "step": 3182 + }, + { + "epoch": 0.5910863509749303, + "grad_norm": 1.4296607971191406, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8749915361404419, + "num_tokens": 115973523.0, + "step": 3183 + }, + { + "epoch": 0.591272051996286, + "grad_norm": 1.493850588798523, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8802237510681152, + "num_tokens": 116013248.0, + "step": 3184 + }, + { + "epoch": 0.5914577530176416, + "grad_norm": 1.5207411050796509, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8645364046096802, + "num_tokens": 116047148.0, + "step": 3185 + }, + { + "epoch": 0.5916434540389972, + "grad_norm": 1.491226077079773, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8630802631378174, + "num_tokens": 116084494.0, + "step": 3186 + }, + { + "epoch": 0.5918291550603528, + "grad_norm": 1.4746677875518799, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8600214719772339, + "num_tokens": 116127951.0, + "step": 3187 + }, + { + "epoch": 0.5920148560817085, + "grad_norm": 1.379389762878418, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8685191869735718, + "num_tokens": 116171307.0, + "step": 3188 + }, + { + "epoch": 0.5922005571030641, + "grad_norm": 1.5777854919433594, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8652547001838684, + "num_tokens": 116206981.0, + "step": 3189 + }, + { + "epoch": 0.5923862581244197, + "grad_norm": 1.4625635147094727, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8574163913726807, + "num_tokens": 116247334.0, + "step": 3190 + }, + { + "epoch": 0.5925719591457753, + "grad_norm": 1.6436485052108765, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8547407984733582, + "num_tokens": 116283982.0, + "step": 3191 + }, + { + "epoch": 0.5927576601671309, + "grad_norm": 1.669346570968628, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8581660985946655, + "num_tokens": 116317313.0, + "step": 3192 + }, + { + "epoch": 0.5929433611884866, + "grad_norm": 1.4922716617584229, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8628001809120178, + "num_tokens": 116355780.0, + "step": 3193 + }, + { + "epoch": 0.5931290622098422, + "grad_norm": 1.531484603881836, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8588334321975708, + "num_tokens": 116393437.0, + "step": 3194 + }, + { + "epoch": 0.5933147632311978, + "grad_norm": 1.5511431694030762, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8641546964645386, + "num_tokens": 116428667.0, + "step": 3195 + }, + { + "epoch": 0.5935004642525534, + "grad_norm": 1.5596779584884644, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.872065544128418, + "num_tokens": 116464385.0, + "step": 3196 + }, + { + "epoch": 0.593686165273909, + "grad_norm": 1.6847221851348877, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8696714639663696, + "num_tokens": 116497521.0, + "step": 3197 + }, + { + "epoch": 0.5938718662952647, + "grad_norm": 1.678105354309082, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8616088628768921, + "num_tokens": 116528143.0, + "step": 3198 + }, + { + "epoch": 0.5940575673166203, + "grad_norm": 1.4851572513580322, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8589687943458557, + "num_tokens": 116565539.0, + "step": 3199 + }, + { + "epoch": 0.5942432683379758, + "grad_norm": 1.4744715690612793, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8760381937026978, + "num_tokens": 116601701.0, + "step": 3200 + }, + { + "epoch": 0.5944289693593314, + "grad_norm": 1.483484148979187, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8698620796203613, + "num_tokens": 116639678.0, + "step": 3201 + }, + { + "epoch": 0.594614670380687, + "grad_norm": 1.6243029832839966, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8724532127380371, + "num_tokens": 116677435.0, + "step": 3202 + }, + { + "epoch": 0.5948003714020427, + "grad_norm": 1.6483393907546997, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8539119958877563, + "num_tokens": 116713145.0, + "step": 3203 + }, + { + "epoch": 0.5949860724233983, + "grad_norm": 1.490212321281433, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8526155948638916, + "num_tokens": 116757991.0, + "step": 3204 + }, + { + "epoch": 0.5951717734447539, + "grad_norm": 1.576116681098938, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8655945062637329, + "num_tokens": 116795189.0, + "step": 3205 + }, + { + "epoch": 0.5953574744661095, + "grad_norm": 1.545667290687561, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8586152195930481, + "num_tokens": 116833019.0, + "step": 3206 + }, + { + "epoch": 0.5955431754874652, + "grad_norm": 1.3220206499099731, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8866817951202393, + "num_tokens": 116875460.0, + "step": 3207 + }, + { + "epoch": 0.5957288765088208, + "grad_norm": 1.6584999561309814, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8631869554519653, + "num_tokens": 116907274.0, + "step": 3208 + }, + { + "epoch": 0.5959145775301764, + "grad_norm": 1.912194848060608, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8592950701713562, + "num_tokens": 116940362.0, + "step": 3209 + }, + { + "epoch": 0.596100278551532, + "grad_norm": 1.5953737497329712, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8632951378822327, + "num_tokens": 116976247.0, + "step": 3210 + }, + { + "epoch": 0.5962859795728876, + "grad_norm": 1.3956868648529053, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8620938658714294, + "num_tokens": 117018916.0, + "step": 3211 + }, + { + "epoch": 0.5964716805942433, + "grad_norm": 1.7860281467437744, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8678524494171143, + "num_tokens": 117044336.0, + "step": 3212 + }, + { + "epoch": 0.5966573816155989, + "grad_norm": 1.5587722063064575, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8539793491363525, + "num_tokens": 117078697.0, + "step": 3213 + }, + { + "epoch": 0.5968430826369545, + "grad_norm": 1.4101296663284302, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8691748976707458, + "num_tokens": 117118299.0, + "step": 3214 + }, + { + "epoch": 0.5970287836583101, + "grad_norm": 1.5561987161636353, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8717919588088989, + "num_tokens": 117154984.0, + "step": 3215 + }, + { + "epoch": 0.5972144846796658, + "grad_norm": 1.5712943077087402, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8661144971847534, + "num_tokens": 117189748.0, + "step": 3216 + }, + { + "epoch": 0.5974001857010214, + "grad_norm": 1.6601639986038208, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8661656975746155, + "num_tokens": 117223095.0, + "step": 3217 + }, + { + "epoch": 0.597585886722377, + "grad_norm": 1.4577569961547852, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8654025793075562, + "num_tokens": 117263555.0, + "step": 3218 + }, + { + "epoch": 0.5977715877437326, + "grad_norm": 1.5851880311965942, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8518112301826477, + "num_tokens": 117302704.0, + "step": 3219 + }, + { + "epoch": 0.5979572887650882, + "grad_norm": 1.606682300567627, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8740566372871399, + "num_tokens": 117337141.0, + "step": 3220 + }, + { + "epoch": 0.5981429897864439, + "grad_norm": 1.5720256567001343, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8533827066421509, + "num_tokens": 117377622.0, + "step": 3221 + }, + { + "epoch": 0.5983286908077995, + "grad_norm": 1.475776195526123, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8721950054168701, + "num_tokens": 117416238.0, + "step": 3222 + }, + { + "epoch": 0.5985143918291551, + "grad_norm": 1.3420532941818237, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8731381893157959, + "num_tokens": 117460012.0, + "step": 3223 + }, + { + "epoch": 0.5987000928505106, + "grad_norm": 1.6077752113342285, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8641597628593445, + "num_tokens": 117491920.0, + "step": 3224 + }, + { + "epoch": 0.5988857938718662, + "grad_norm": 1.5557823181152344, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8651200532913208, + "num_tokens": 117529101.0, + "step": 3225 + }, + { + "epoch": 0.5990714948932219, + "grad_norm": 1.6757457256317139, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8544498682022095, + "num_tokens": 117561894.0, + "step": 3226 + }, + { + "epoch": 0.5992571959145775, + "grad_norm": 1.7753461599349976, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8656151294708252, + "num_tokens": 117592357.0, + "step": 3227 + }, + { + "epoch": 0.5994428969359331, + "grad_norm": 1.5973820686340332, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8636656999588013, + "num_tokens": 117626515.0, + "step": 3228 + }, + { + "epoch": 0.5996285979572887, + "grad_norm": 1.5135375261306763, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8707358241081238, + "num_tokens": 117666150.0, + "step": 3229 + }, + { + "epoch": 0.5998142989786444, + "grad_norm": 1.4910411834716797, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8840562105178833, + "num_tokens": 117701315.0, + "step": 3230 + }, + { + "epoch": 0.6, + "grad_norm": 1.6481688022613525, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8452755808830261, + "num_tokens": 117735266.0, + "step": 3231 + }, + { + "epoch": 0.6001857010213556, + "grad_norm": 1.675748348236084, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8515772819519043, + "num_tokens": 117766945.0, + "step": 3232 + }, + { + "epoch": 0.6003714020427112, + "grad_norm": 1.6226288080215454, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8635099530220032, + "num_tokens": 117808137.0, + "step": 3233 + }, + { + "epoch": 0.6005571030640668, + "grad_norm": 1.5477573871612549, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8702119588851929, + "num_tokens": 117845527.0, + "step": 3234 + }, + { + "epoch": 0.6007428040854225, + "grad_norm": 1.5306237936019897, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8646471500396729, + "num_tokens": 117880018.0, + "step": 3235 + }, + { + "epoch": 0.6009285051067781, + "grad_norm": 1.472484827041626, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8714315295219421, + "num_tokens": 117920340.0, + "step": 3236 + }, + { + "epoch": 0.6011142061281337, + "grad_norm": 1.4647164344787598, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8616654872894287, + "num_tokens": 117961092.0, + "step": 3237 + }, + { + "epoch": 0.6012999071494893, + "grad_norm": 1.4851953983306885, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8644745349884033, + "num_tokens": 118000900.0, + "step": 3238 + }, + { + "epoch": 0.601485608170845, + "grad_norm": 1.4337670803070068, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8670766949653625, + "num_tokens": 118040959.0, + "step": 3239 + }, + { + "epoch": 0.6016713091922006, + "grad_norm": 1.5891832113265991, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8499841690063477, + "num_tokens": 118077552.0, + "step": 3240 + }, + { + "epoch": 0.6018570102135562, + "grad_norm": 1.5594844818115234, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8703949451446533, + "num_tokens": 118115406.0, + "step": 3241 + }, + { + "epoch": 0.6020427112349118, + "grad_norm": 1.535531759262085, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8539782762527466, + "num_tokens": 118150327.0, + "step": 3242 + }, + { + "epoch": 0.6022284122562674, + "grad_norm": 1.5800845623016357, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8668205142021179, + "num_tokens": 118184678.0, + "step": 3243 + }, + { + "epoch": 0.6024141132776231, + "grad_norm": 1.622839331626892, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8615297079086304, + "num_tokens": 118217847.0, + "step": 3244 + }, + { + "epoch": 0.6025998142989787, + "grad_norm": 1.54062819480896, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8735333681106567, + "num_tokens": 118252017.0, + "step": 3245 + }, + { + "epoch": 0.6027855153203343, + "grad_norm": 1.5112504959106445, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8771407008171082, + "num_tokens": 118289298.0, + "step": 3246 + }, + { + "epoch": 0.6029712163416899, + "grad_norm": 1.5007785558700562, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8684220314025879, + "num_tokens": 118326142.0, + "step": 3247 + }, + { + "epoch": 0.6031569173630454, + "grad_norm": 1.5699163675308228, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8629163503646851, + "num_tokens": 118358068.0, + "step": 3248 + }, + { + "epoch": 0.6033426183844011, + "grad_norm": 1.5018409490585327, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8664966821670532, + "num_tokens": 118395009.0, + "step": 3249 + }, + { + "epoch": 0.6035283194057567, + "grad_norm": 1.4486933946609497, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.861274242401123, + "num_tokens": 118434810.0, + "step": 3250 + }, + { + "epoch": 0.6037140204271123, + "grad_norm": 1.472444772720337, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.872870147228241, + "num_tokens": 118472344.0, + "step": 3251 + }, + { + "epoch": 0.6038997214484679, + "grad_norm": 1.6970278024673462, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8720741271972656, + "num_tokens": 118503247.0, + "step": 3252 + }, + { + "epoch": 0.6040854224698236, + "grad_norm": 1.4912103414535522, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8510924577713013, + "num_tokens": 118545542.0, + "step": 3253 + }, + { + "epoch": 0.6042711234911792, + "grad_norm": 1.5684983730316162, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8607015013694763, + "num_tokens": 118580138.0, + "step": 3254 + }, + { + "epoch": 0.6044568245125348, + "grad_norm": 1.5524593591690063, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.857835054397583, + "num_tokens": 118616282.0, + "step": 3255 + }, + { + "epoch": 0.6046425255338904, + "grad_norm": 1.4697277545928955, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8701624274253845, + "num_tokens": 118650448.0, + "step": 3256 + }, + { + "epoch": 0.604828226555246, + "grad_norm": 1.640947699546814, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8508836030960083, + "num_tokens": 118685800.0, + "step": 3257 + }, + { + "epoch": 0.6050139275766017, + "grad_norm": 1.4872443675994873, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8782949447631836, + "num_tokens": 118720895.0, + "step": 3258 + }, + { + "epoch": 0.6051996285979573, + "grad_norm": 1.6098984479904175, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8465422987937927, + "num_tokens": 118758923.0, + "step": 3259 + }, + { + "epoch": 0.6053853296193129, + "grad_norm": 1.470900297164917, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8619552850723267, + "num_tokens": 118795431.0, + "step": 3260 + }, + { + "epoch": 0.6055710306406685, + "grad_norm": 1.5200005769729614, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.857745885848999, + "num_tokens": 118832754.0, + "step": 3261 + }, + { + "epoch": 0.6057567316620242, + "grad_norm": 1.4949702024459839, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8702689409255981, + "num_tokens": 118867474.0, + "step": 3262 + }, + { + "epoch": 0.6059424326833798, + "grad_norm": 1.625575304031372, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8741534948348999, + "num_tokens": 118898838.0, + "step": 3263 + }, + { + "epoch": 0.6061281337047354, + "grad_norm": 1.6096398830413818, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8547750115394592, + "num_tokens": 118931379.0, + "step": 3264 + }, + { + "epoch": 0.606313834726091, + "grad_norm": 1.5966243743896484, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8651556968688965, + "num_tokens": 118966427.0, + "step": 3265 + }, + { + "epoch": 0.6064995357474466, + "grad_norm": 1.4747716188430786, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.864209771156311, + "num_tokens": 119008566.0, + "step": 3266 + }, + { + "epoch": 0.6066852367688023, + "grad_norm": 1.7045444250106812, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8622249364852905, + "num_tokens": 119041521.0, + "step": 3267 + }, + { + "epoch": 0.6068709377901579, + "grad_norm": 1.5577236413955688, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.842692494392395, + "num_tokens": 119080896.0, + "step": 3268 + }, + { + "epoch": 0.6070566388115135, + "grad_norm": 1.4903905391693115, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8679460287094116, + "num_tokens": 119120234.0, + "step": 3269 + }, + { + "epoch": 0.6072423398328691, + "grad_norm": 1.4988617897033691, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8623155355453491, + "num_tokens": 119156903.0, + "step": 3270 + }, + { + "epoch": 0.6074280408542247, + "grad_norm": 1.5985506772994995, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8793657422065735, + "num_tokens": 119186580.0, + "step": 3271 + }, + { + "epoch": 0.6076137418755804, + "grad_norm": 1.4908251762390137, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8751801252365112, + "num_tokens": 119223646.0, + "step": 3272 + }, + { + "epoch": 0.6077994428969359, + "grad_norm": 1.432882308959961, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8758577108383179, + "num_tokens": 119262176.0, + "step": 3273 + }, + { + "epoch": 0.6079851439182915, + "grad_norm": 1.4641166925430298, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8671034574508667, + "num_tokens": 119301277.0, + "step": 3274 + }, + { + "epoch": 0.6081708449396471, + "grad_norm": 1.6563154458999634, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8622550368309021, + "num_tokens": 119336450.0, + "step": 3275 + }, + { + "epoch": 0.6083565459610027, + "grad_norm": 1.5815467834472656, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.877403736114502, + "num_tokens": 119370930.0, + "step": 3276 + }, + { + "epoch": 0.6085422469823584, + "grad_norm": 1.4677001237869263, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8537454009056091, + "num_tokens": 119415911.0, + "step": 3277 + }, + { + "epoch": 0.608727948003714, + "grad_norm": 1.6329048871994019, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8697549104690552, + "num_tokens": 119449120.0, + "step": 3278 + }, + { + "epoch": 0.6089136490250696, + "grad_norm": 1.52005136013031, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8586468696594238, + "num_tokens": 119488120.0, + "step": 3279 + }, + { + "epoch": 0.6090993500464252, + "grad_norm": 1.3880888223648071, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8826289772987366, + "num_tokens": 119529718.0, + "step": 3280 + }, + { + "epoch": 0.6092850510677809, + "grad_norm": 1.5952574014663696, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8621671199798584, + "num_tokens": 119567685.0, + "step": 3281 + }, + { + "epoch": 0.6094707520891365, + "grad_norm": 1.556274652481079, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8650752305984497, + "num_tokens": 119605543.0, + "step": 3282 + }, + { + "epoch": 0.6096564531104921, + "grad_norm": 1.806235909461975, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8664997220039368, + "num_tokens": 119637347.0, + "step": 3283 + }, + { + "epoch": 0.6098421541318477, + "grad_norm": 1.440870761871338, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8645390272140503, + "num_tokens": 119678219.0, + "step": 3284 + }, + { + "epoch": 0.6100278551532033, + "grad_norm": 1.733069658279419, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8612388372421265, + "num_tokens": 119708510.0, + "step": 3285 + }, + { + "epoch": 0.610213556174559, + "grad_norm": 1.4154750108718872, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8594693541526794, + "num_tokens": 119752952.0, + "step": 3286 + }, + { + "epoch": 0.6103992571959146, + "grad_norm": 1.6882193088531494, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.858527660369873, + "num_tokens": 119785348.0, + "step": 3287 + }, + { + "epoch": 0.6105849582172702, + "grad_norm": 1.4917771816253662, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8671336770057678, + "num_tokens": 119823659.0, + "step": 3288 + }, + { + "epoch": 0.6107706592386258, + "grad_norm": 1.5300629138946533, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8692507147789001, + "num_tokens": 119862801.0, + "step": 3289 + }, + { + "epoch": 0.6109563602599815, + "grad_norm": 1.5972028970718384, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8684408664703369, + "num_tokens": 119897234.0, + "step": 3290 + }, + { + "epoch": 0.6111420612813371, + "grad_norm": 1.3792805671691895, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8811808824539185, + "num_tokens": 119937283.0, + "step": 3291 + }, + { + "epoch": 0.6113277623026927, + "grad_norm": 1.4385218620300293, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8720414638519287, + "num_tokens": 119977877.0, + "step": 3292 + }, + { + "epoch": 0.6115134633240483, + "grad_norm": 1.4782549142837524, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8625259399414062, + "num_tokens": 120017604.0, + "step": 3293 + }, + { + "epoch": 0.611699164345404, + "grad_norm": 1.5038797855377197, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8613196015357971, + "num_tokens": 120058492.0, + "step": 3294 + }, + { + "epoch": 0.6118848653667596, + "grad_norm": 1.552227258682251, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8710429072380066, + "num_tokens": 120092529.0, + "step": 3295 + }, + { + "epoch": 0.6120705663881152, + "grad_norm": 1.5742418766021729, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8658984899520874, + "num_tokens": 120130985.0, + "step": 3296 + }, + { + "epoch": 0.6122562674094707, + "grad_norm": 1.531030297279358, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8544628620147705, + "num_tokens": 120169101.0, + "step": 3297 + }, + { + "epoch": 0.6124419684308263, + "grad_norm": 1.5146723985671997, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8586490750312805, + "num_tokens": 120205700.0, + "step": 3298 + }, + { + "epoch": 0.6126276694521819, + "grad_norm": 1.540368914604187, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8669289350509644, + "num_tokens": 120242319.0, + "step": 3299 + }, + { + "epoch": 0.6128133704735376, + "grad_norm": 1.5607609748840332, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8679505586624146, + "num_tokens": 120276846.0, + "step": 3300 + }, + { + "epoch": 0.6129990714948932, + "grad_norm": 1.607384443283081, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8410677909851074, + "num_tokens": 120318477.0, + "step": 3301 + }, + { + "epoch": 0.6131847725162488, + "grad_norm": 1.370246410369873, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8643550872802734, + "num_tokens": 120364319.0, + "step": 3302 + }, + { + "epoch": 0.6133704735376044, + "grad_norm": 1.548531174659729, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8632339239120483, + "num_tokens": 120400056.0, + "step": 3303 + }, + { + "epoch": 0.61355617455896, + "grad_norm": 1.5761133432388306, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8579161167144775, + "num_tokens": 120435550.0, + "step": 3304 + }, + { + "epoch": 0.6137418755803157, + "grad_norm": 1.6197922229766846, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8688596487045288, + "num_tokens": 120469551.0, + "step": 3305 + }, + { + "epoch": 0.6139275766016713, + "grad_norm": 1.4285303354263306, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8763414025306702, + "num_tokens": 120505234.0, + "step": 3306 + }, + { + "epoch": 0.6141132776230269, + "grad_norm": 1.5964974164962769, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8629595041275024, + "num_tokens": 120538619.0, + "step": 3307 + }, + { + "epoch": 0.6142989786443825, + "grad_norm": 1.3706527948379517, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8831018209457397, + "num_tokens": 120577765.0, + "step": 3308 + }, + { + "epoch": 0.6144846796657382, + "grad_norm": 1.6431530714035034, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8677113056182861, + "num_tokens": 120609243.0, + "step": 3309 + }, + { + "epoch": 0.6146703806870938, + "grad_norm": 1.6073087453842163, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8456954956054688, + "num_tokens": 120646136.0, + "step": 3310 + }, + { + "epoch": 0.6148560817084494, + "grad_norm": 1.659532070159912, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8583712577819824, + "num_tokens": 120683113.0, + "step": 3311 + }, + { + "epoch": 0.615041782729805, + "grad_norm": 1.6032488346099854, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.876806378364563, + "num_tokens": 120715743.0, + "step": 3312 + }, + { + "epoch": 0.6152274837511607, + "grad_norm": 1.495078206062317, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8735074996948242, + "num_tokens": 120752261.0, + "step": 3313 + }, + { + "epoch": 0.6154131847725163, + "grad_norm": 2.155061960220337, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8539881110191345, + "num_tokens": 120777657.0, + "step": 3314 + }, + { + "epoch": 0.6155988857938719, + "grad_norm": 1.5503432750701904, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8692877292633057, + "num_tokens": 120812754.0, + "step": 3315 + }, + { + "epoch": 0.6157845868152275, + "grad_norm": 1.4422539472579956, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8750805854797363, + "num_tokens": 120854024.0, + "step": 3316 + }, + { + "epoch": 0.6159702878365831, + "grad_norm": 1.5683205127716064, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8587080836296082, + "num_tokens": 120890477.0, + "step": 3317 + }, + { + "epoch": 0.6161559888579388, + "grad_norm": 1.6150081157684326, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8605493307113647, + "num_tokens": 120924354.0, + "step": 3318 + }, + { + "epoch": 0.6163416898792944, + "grad_norm": 1.514007568359375, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8725451231002808, + "num_tokens": 120959558.0, + "step": 3319 + }, + { + "epoch": 0.61652739090065, + "grad_norm": 1.5250710248947144, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8567531108856201, + "num_tokens": 120996885.0, + "step": 3320 + }, + { + "epoch": 0.6167130919220055, + "grad_norm": 1.764768362045288, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8644876480102539, + "num_tokens": 121028312.0, + "step": 3321 + }, + { + "epoch": 0.6168987929433611, + "grad_norm": 1.4493166208267212, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8788737654685974, + "num_tokens": 121068885.0, + "step": 3322 + }, + { + "epoch": 0.6170844939647168, + "grad_norm": 1.4995861053466797, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8713564872741699, + "num_tokens": 121105224.0, + "step": 3323 + }, + { + "epoch": 0.6172701949860724, + "grad_norm": 1.6004284620285034, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8650556206703186, + "num_tokens": 121139597.0, + "step": 3324 + }, + { + "epoch": 0.617455896007428, + "grad_norm": 1.4713048934936523, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8633337020874023, + "num_tokens": 121177813.0, + "step": 3325 + }, + { + "epoch": 0.6176415970287836, + "grad_norm": 1.4053016901016235, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8697316646575928, + "num_tokens": 121217496.0, + "step": 3326 + }, + { + "epoch": 0.6178272980501393, + "grad_norm": 1.334344506263733, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.866786777973175, + "num_tokens": 121258529.0, + "step": 3327 + }, + { + "epoch": 0.6180129990714949, + "grad_norm": 1.570844292640686, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8575334548950195, + "num_tokens": 121292570.0, + "step": 3328 + }, + { + "epoch": 0.6181987000928505, + "grad_norm": 1.6174554824829102, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.86549973487854, + "num_tokens": 121325953.0, + "step": 3329 + }, + { + "epoch": 0.6183844011142061, + "grad_norm": 1.537325143814087, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8521652221679688, + "num_tokens": 121363804.0, + "step": 3330 + }, + { + "epoch": 0.6185701021355617, + "grad_norm": 1.4358762502670288, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8759415745735168, + "num_tokens": 121404737.0, + "step": 3331 + }, + { + "epoch": 0.6187558031569174, + "grad_norm": 1.5769712924957275, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8662475943565369, + "num_tokens": 121439732.0, + "step": 3332 + }, + { + "epoch": 0.618941504178273, + "grad_norm": 1.6256440877914429, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.856299638748169, + "num_tokens": 121476040.0, + "step": 3333 + }, + { + "epoch": 0.6191272051996286, + "grad_norm": 1.4801157712936401, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8494572639465332, + "num_tokens": 121513569.0, + "step": 3334 + }, + { + "epoch": 0.6193129062209842, + "grad_norm": 1.5331065654754639, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8550326824188232, + "num_tokens": 121554190.0, + "step": 3335 + }, + { + "epoch": 0.6194986072423398, + "grad_norm": 1.6397379636764526, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8594712018966675, + "num_tokens": 121588768.0, + "step": 3336 + }, + { + "epoch": 0.6196843082636955, + "grad_norm": 1.3552758693695068, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8807956576347351, + "num_tokens": 121630390.0, + "step": 3337 + }, + { + "epoch": 0.6198700092850511, + "grad_norm": 1.7663465738296509, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.864818274974823, + "num_tokens": 121662957.0, + "step": 3338 + }, + { + "epoch": 0.6200557103064067, + "grad_norm": 1.544900894165039, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8498992323875427, + "num_tokens": 121704372.0, + "step": 3339 + }, + { + "epoch": 0.6202414113277623, + "grad_norm": 1.6080639362335205, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8576050996780396, + "num_tokens": 121739420.0, + "step": 3340 + }, + { + "epoch": 0.620427112349118, + "grad_norm": 1.588828444480896, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8559973835945129, + "num_tokens": 121772486.0, + "step": 3341 + }, + { + "epoch": 0.6206128133704736, + "grad_norm": 1.68818998336792, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8704765439033508, + "num_tokens": 121808059.0, + "step": 3342 + }, + { + "epoch": 0.6207985143918292, + "grad_norm": 1.5280351638793945, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8683133721351624, + "num_tokens": 121846355.0, + "step": 3343 + }, + { + "epoch": 0.6209842154131848, + "grad_norm": 1.4472094774246216, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.861525297164917, + "num_tokens": 121884415.0, + "step": 3344 + }, + { + "epoch": 0.6211699164345403, + "grad_norm": 1.5541958808898926, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8574889898300171, + "num_tokens": 121919961.0, + "step": 3345 + }, + { + "epoch": 0.621355617455896, + "grad_norm": 1.61952543258667, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8634600639343262, + "num_tokens": 121953789.0, + "step": 3346 + }, + { + "epoch": 0.6215413184772516, + "grad_norm": 1.53961980342865, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8801299333572388, + "num_tokens": 121986263.0, + "step": 3347 + }, + { + "epoch": 0.6217270194986072, + "grad_norm": 1.412157416343689, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8709704875946045, + "num_tokens": 122024161.0, + "step": 3348 + }, + { + "epoch": 0.6219127205199628, + "grad_norm": 1.474666953086853, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8589242696762085, + "num_tokens": 122061197.0, + "step": 3349 + }, + { + "epoch": 0.6220984215413184, + "grad_norm": 1.4188379049301147, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8712880611419678, + "num_tokens": 122100282.0, + "step": 3350 + }, + { + "epoch": 0.6222841225626741, + "grad_norm": 1.5315955877304077, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.859587550163269, + "num_tokens": 122135881.0, + "step": 3351 + }, + { + "epoch": 0.6224698235840297, + "grad_norm": 1.570982575416565, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8596562743186951, + "num_tokens": 122171263.0, + "step": 3352 + }, + { + "epoch": 0.6226555246053853, + "grad_norm": 1.4363418817520142, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.877214789390564, + "num_tokens": 122208556.0, + "step": 3353 + }, + { + "epoch": 0.6228412256267409, + "grad_norm": 1.4994237422943115, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8781627416610718, + "num_tokens": 122243009.0, + "step": 3354 + }, + { + "epoch": 0.6230269266480966, + "grad_norm": 1.6141411066055298, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8509534597396851, + "num_tokens": 122279527.0, + "step": 3355 + }, + { + "epoch": 0.6232126276694522, + "grad_norm": 1.4731855392456055, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8829249143600464, + "num_tokens": 122313264.0, + "step": 3356 + }, + { + "epoch": 0.6233983286908078, + "grad_norm": 1.5893179178237915, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.866990327835083, + "num_tokens": 122345122.0, + "step": 3357 + }, + { + "epoch": 0.6235840297121634, + "grad_norm": 1.6585291624069214, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8698139190673828, + "num_tokens": 122374570.0, + "step": 3358 + }, + { + "epoch": 0.623769730733519, + "grad_norm": 1.6365011930465698, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8549852967262268, + "num_tokens": 122414640.0, + "step": 3359 + }, + { + "epoch": 0.6239554317548747, + "grad_norm": 1.7224273681640625, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8668954372406006, + "num_tokens": 122444921.0, + "step": 3360 + }, + { + "epoch": 0.6241411327762303, + "grad_norm": 1.5762970447540283, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8726834058761597, + "num_tokens": 122478184.0, + "step": 3361 + }, + { + "epoch": 0.6243268337975859, + "grad_norm": 1.5400162935256958, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8737385272979736, + "num_tokens": 122510210.0, + "step": 3362 + }, + { + "epoch": 0.6245125348189415, + "grad_norm": 1.4835718870162964, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8571387529373169, + "num_tokens": 122549506.0, + "step": 3363 + }, + { + "epoch": 0.6246982358402972, + "grad_norm": 1.523257851600647, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8653885126113892, + "num_tokens": 122591096.0, + "step": 3364 + }, + { + "epoch": 0.6248839368616528, + "grad_norm": 1.4452699422836304, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8751410245895386, + "num_tokens": 122627781.0, + "step": 3365 + }, + { + "epoch": 0.6250696378830084, + "grad_norm": 1.4875166416168213, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.865034282207489, + "num_tokens": 122671476.0, + "step": 3366 + }, + { + "epoch": 0.625255338904364, + "grad_norm": 1.519770860671997, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8844424486160278, + "num_tokens": 122706991.0, + "step": 3367 + }, + { + "epoch": 0.6254410399257196, + "grad_norm": 1.5194815397262573, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.868220329284668, + "num_tokens": 122743255.0, + "step": 3368 + }, + { + "epoch": 0.6256267409470752, + "grad_norm": 1.6129343509674072, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8623733520507812, + "num_tokens": 122775860.0, + "step": 3369 + }, + { + "epoch": 0.6258124419684308, + "grad_norm": 1.6435033082962036, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8537523746490479, + "num_tokens": 122814587.0, + "step": 3370 + }, + { + "epoch": 0.6259981429897864, + "grad_norm": 1.6502666473388672, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8546189069747925, + "num_tokens": 122847998.0, + "step": 3371 + }, + { + "epoch": 0.626183844011142, + "grad_norm": 1.567979335784912, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8624461889266968, + "num_tokens": 122885830.0, + "step": 3372 + }, + { + "epoch": 0.6263695450324976, + "grad_norm": 1.4517102241516113, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8676190376281738, + "num_tokens": 122930317.0, + "step": 3373 + }, + { + "epoch": 0.6265552460538533, + "grad_norm": 1.6705130338668823, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8528839349746704, + "num_tokens": 122963435.0, + "step": 3374 + }, + { + "epoch": 0.6267409470752089, + "grad_norm": 1.5909994840621948, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8741352558135986, + "num_tokens": 122994479.0, + "step": 3375 + }, + { + "epoch": 0.6269266480965645, + "grad_norm": 1.6306571960449219, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8490456342697144, + "num_tokens": 123031204.0, + "step": 3376 + }, + { + "epoch": 0.6271123491179201, + "grad_norm": 1.525611162185669, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8509302735328674, + "num_tokens": 123069002.0, + "step": 3377 + }, + { + "epoch": 0.6272980501392758, + "grad_norm": 1.4247441291809082, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8581708669662476, + "num_tokens": 123110751.0, + "step": 3378 + }, + { + "epoch": 0.6274837511606314, + "grad_norm": 1.6070572137832642, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8561581373214722, + "num_tokens": 123145051.0, + "step": 3379 + }, + { + "epoch": 0.627669452181987, + "grad_norm": 1.5736405849456787, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8633905053138733, + "num_tokens": 123184431.0, + "step": 3380 + }, + { + "epoch": 0.6278551532033426, + "grad_norm": 1.5639253854751587, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8542596101760864, + "num_tokens": 123220177.0, + "step": 3381 + }, + { + "epoch": 0.6280408542246982, + "grad_norm": 1.645887851715088, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8551956415176392, + "num_tokens": 123253843.0, + "step": 3382 + }, + { + "epoch": 0.6282265552460539, + "grad_norm": 1.4131907224655151, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8759114146232605, + "num_tokens": 123293977.0, + "step": 3383 + }, + { + "epoch": 0.6284122562674095, + "grad_norm": 1.6522170305252075, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8619527220726013, + "num_tokens": 123333456.0, + "step": 3384 + }, + { + "epoch": 0.6285979572887651, + "grad_norm": 1.5545846223831177, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8661211133003235, + "num_tokens": 123366916.0, + "step": 3385 + }, + { + "epoch": 0.6287836583101207, + "grad_norm": 1.627843976020813, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.870083212852478, + "num_tokens": 123399803.0, + "step": 3386 + }, + { + "epoch": 0.6289693593314764, + "grad_norm": 1.5734024047851562, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8727543950080872, + "num_tokens": 123434453.0, + "step": 3387 + }, + { + "epoch": 0.629155060352832, + "grad_norm": 1.3870775699615479, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8845658302307129, + "num_tokens": 123475998.0, + "step": 3388 + }, + { + "epoch": 0.6293407613741876, + "grad_norm": 1.6589173078536987, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.872908353805542, + "num_tokens": 123508715.0, + "step": 3389 + }, + { + "epoch": 0.6295264623955432, + "grad_norm": 1.5094857215881348, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8681625723838806, + "num_tokens": 123542839.0, + "step": 3390 + }, + { + "epoch": 0.6297121634168988, + "grad_norm": 1.6387324333190918, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8710424900054932, + "num_tokens": 123574159.0, + "step": 3391 + }, + { + "epoch": 0.6298978644382545, + "grad_norm": 1.6968662738800049, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8635851144790649, + "num_tokens": 123608949.0, + "step": 3392 + }, + { + "epoch": 0.63008356545961, + "grad_norm": 1.679236650466919, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8661019802093506, + "num_tokens": 123642356.0, + "step": 3393 + }, + { + "epoch": 0.6302692664809656, + "grad_norm": 1.509729266166687, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8684085607528687, + "num_tokens": 123681323.0, + "step": 3394 + }, + { + "epoch": 0.6304549675023212, + "grad_norm": 1.4372261762619019, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8749105930328369, + "num_tokens": 123720373.0, + "step": 3395 + }, + { + "epoch": 0.6306406685236768, + "grad_norm": 1.6394951343536377, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8561093807220459, + "num_tokens": 123757671.0, + "step": 3396 + }, + { + "epoch": 0.6308263695450325, + "grad_norm": 1.5332200527191162, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8723486661911011, + "num_tokens": 123790864.0, + "step": 3397 + }, + { + "epoch": 0.6310120705663881, + "grad_norm": 1.4161021709442139, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8725978136062622, + "num_tokens": 123827390.0, + "step": 3398 + }, + { + "epoch": 0.6311977715877437, + "grad_norm": 1.4702563285827637, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8777655363082886, + "num_tokens": 123864538.0, + "step": 3399 + }, + { + "epoch": 0.6313834726090993, + "grad_norm": 1.5607353448867798, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8695282936096191, + "num_tokens": 123899791.0, + "step": 3400 + }, + { + "epoch": 0.631569173630455, + "grad_norm": 1.5461845397949219, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.854219377040863, + "num_tokens": 123936224.0, + "step": 3401 + }, + { + "epoch": 0.6317548746518106, + "grad_norm": 1.44011652469635, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8815951347351074, + "num_tokens": 123971598.0, + "step": 3402 + }, + { + "epoch": 0.6319405756731662, + "grad_norm": 1.579603672027588, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8834237456321716, + "num_tokens": 124002742.0, + "step": 3403 + }, + { + "epoch": 0.6321262766945218, + "grad_norm": 1.658700942993164, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8455566763877869, + "num_tokens": 124041162.0, + "step": 3404 + }, + { + "epoch": 0.6323119777158774, + "grad_norm": 1.6038146018981934, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8579230308532715, + "num_tokens": 124081136.0, + "step": 3405 + }, + { + "epoch": 0.6324976787372331, + "grad_norm": 1.4453274011611938, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.870856523513794, + "num_tokens": 124123164.0, + "step": 3406 + }, + { + "epoch": 0.6326833797585887, + "grad_norm": 1.512294888496399, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8654841184616089, + "num_tokens": 124161324.0, + "step": 3407 + }, + { + "epoch": 0.6328690807799443, + "grad_norm": 1.5774576663970947, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8636462688446045, + "num_tokens": 124197640.0, + "step": 3408 + }, + { + "epoch": 0.6330547818012999, + "grad_norm": 1.4715913534164429, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8625974655151367, + "num_tokens": 124235608.0, + "step": 3409 + }, + { + "epoch": 0.6332404828226555, + "grad_norm": 1.5571839809417725, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8591353297233582, + "num_tokens": 124273623.0, + "step": 3410 + }, + { + "epoch": 0.6334261838440112, + "grad_norm": 1.3467011451721191, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8715910911560059, + "num_tokens": 124315718.0, + "step": 3411 + }, + { + "epoch": 0.6336118848653668, + "grad_norm": 1.5595190525054932, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8533480167388916, + "num_tokens": 124356020.0, + "step": 3412 + }, + { + "epoch": 0.6337975858867224, + "grad_norm": 1.505455493927002, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8539192080497742, + "num_tokens": 124395538.0, + "step": 3413 + }, + { + "epoch": 0.633983286908078, + "grad_norm": 1.4772018194198608, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8644789457321167, + "num_tokens": 124437180.0, + "step": 3414 + }, + { + "epoch": 0.6341689879294337, + "grad_norm": 1.5052475929260254, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8573503494262695, + "num_tokens": 124476895.0, + "step": 3415 + }, + { + "epoch": 0.6343546889507893, + "grad_norm": 1.537183165550232, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8696657419204712, + "num_tokens": 124512272.0, + "step": 3416 + }, + { + "epoch": 0.6345403899721448, + "grad_norm": 1.7053121328353882, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8607932925224304, + "num_tokens": 124544461.0, + "step": 3417 + }, + { + "epoch": 0.6347260909935004, + "grad_norm": 1.3614284992218018, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8802990913391113, + "num_tokens": 124587247.0, + "step": 3418 + }, + { + "epoch": 0.634911792014856, + "grad_norm": 1.525000810623169, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8775485754013062, + "num_tokens": 124621349.0, + "step": 3419 + }, + { + "epoch": 0.6350974930362117, + "grad_norm": 1.5002623796463013, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8494598269462585, + "num_tokens": 124659645.0, + "step": 3420 + }, + { + "epoch": 0.6352831940575673, + "grad_norm": 1.4894468784332275, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8691517114639282, + "num_tokens": 124698095.0, + "step": 3421 + }, + { + "epoch": 0.6354688950789229, + "grad_norm": 1.4143283367156982, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8682926893234253, + "num_tokens": 124736627.0, + "step": 3422 + }, + { + "epoch": 0.6356545961002785, + "grad_norm": 1.4995803833007812, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8684296011924744, + "num_tokens": 124776077.0, + "step": 3423 + }, + { + "epoch": 0.6358402971216341, + "grad_norm": 1.495668649673462, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8710484504699707, + "num_tokens": 124810476.0, + "step": 3424 + }, + { + "epoch": 0.6360259981429898, + "grad_norm": 1.4370336532592773, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8873699307441711, + "num_tokens": 124847031.0, + "step": 3425 + }, + { + "epoch": 0.6362116991643454, + "grad_norm": 1.4349473714828491, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8664787411689758, + "num_tokens": 124884416.0, + "step": 3426 + }, + { + "epoch": 0.636397400185701, + "grad_norm": 1.5669488906860352, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8697936534881592, + "num_tokens": 124919482.0, + "step": 3427 + }, + { + "epoch": 0.6365831012070566, + "grad_norm": 1.512624979019165, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8688467741012573, + "num_tokens": 124954787.0, + "step": 3428 + }, + { + "epoch": 0.6367688022284123, + "grad_norm": 1.5065158605575562, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8751121163368225, + "num_tokens": 124988781.0, + "step": 3429 + }, + { + "epoch": 0.6369545032497679, + "grad_norm": 1.5828564167022705, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8495362401008606, + "num_tokens": 125026170.0, + "step": 3430 + }, + { + "epoch": 0.6371402042711235, + "grad_norm": 1.472055435180664, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.868994414806366, + "num_tokens": 125062924.0, + "step": 3431 + }, + { + "epoch": 0.6373259052924791, + "grad_norm": 1.6258330345153809, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8367369771003723, + "num_tokens": 125097067.0, + "step": 3432 + }, + { + "epoch": 0.6375116063138347, + "grad_norm": 1.5614652633666992, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8695544004440308, + "num_tokens": 125131988.0, + "step": 3433 + }, + { + "epoch": 0.6376973073351904, + "grad_norm": 1.3711521625518799, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8680262565612793, + "num_tokens": 125173005.0, + "step": 3434 + }, + { + "epoch": 0.637883008356546, + "grad_norm": 1.5483808517456055, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.868324875831604, + "num_tokens": 125209280.0, + "step": 3435 + }, + { + "epoch": 0.6380687093779016, + "grad_norm": 1.3819828033447266, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8862680196762085, + "num_tokens": 125247924.0, + "step": 3436 + }, + { + "epoch": 0.6382544103992572, + "grad_norm": 1.413029432296753, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8718563318252563, + "num_tokens": 125285099.0, + "step": 3437 + }, + { + "epoch": 0.6384401114206129, + "grad_norm": 1.6958268880844116, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8606427907943726, + "num_tokens": 125319269.0, + "step": 3438 + }, + { + "epoch": 0.6386258124419685, + "grad_norm": 1.534426212310791, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8442880511283875, + "num_tokens": 125359968.0, + "step": 3439 + }, + { + "epoch": 0.6388115134633241, + "grad_norm": 1.433732271194458, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8774110078811646, + "num_tokens": 125394701.0, + "step": 3440 + }, + { + "epoch": 0.6389972144846797, + "grad_norm": 1.6684991121292114, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8479292988777161, + "num_tokens": 125429458.0, + "step": 3441 + }, + { + "epoch": 0.6391829155060352, + "grad_norm": 1.4506444931030273, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8651623725891113, + "num_tokens": 125469905.0, + "step": 3442 + }, + { + "epoch": 0.6393686165273909, + "grad_norm": 1.6200824975967407, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8762699365615845, + "num_tokens": 125501982.0, + "step": 3443 + }, + { + "epoch": 0.6395543175487465, + "grad_norm": 1.4502496719360352, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8788014650344849, + "num_tokens": 125540636.0, + "step": 3444 + }, + { + "epoch": 0.6397400185701021, + "grad_norm": 1.5982316732406616, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8678604364395142, + "num_tokens": 125574881.0, + "step": 3445 + }, + { + "epoch": 0.6399257195914577, + "grad_norm": 1.4452670812606812, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8751943111419678, + "num_tokens": 125612211.0, + "step": 3446 + }, + { + "epoch": 0.6401114206128133, + "grad_norm": 1.604884147644043, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8709745407104492, + "num_tokens": 125648891.0, + "step": 3447 + }, + { + "epoch": 0.640297121634169, + "grad_norm": 1.4169062376022339, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8662708401679993, + "num_tokens": 125690888.0, + "step": 3448 + }, + { + "epoch": 0.6404828226555246, + "grad_norm": 1.5436588525772095, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8773494362831116, + "num_tokens": 125724397.0, + "step": 3449 + }, + { + "epoch": 0.6406685236768802, + "grad_norm": 1.5469834804534912, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.845994770526886, + "num_tokens": 125761311.0, + "step": 3450 + }, + { + "epoch": 0.6408542246982358, + "grad_norm": 1.4996963739395142, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8678641319274902, + "num_tokens": 125797257.0, + "step": 3451 + }, + { + "epoch": 0.6410399257195915, + "grad_norm": 1.6278483867645264, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8647528886795044, + "num_tokens": 125831297.0, + "step": 3452 + }, + { + "epoch": 0.6412256267409471, + "grad_norm": 1.5743663311004639, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8679533004760742, + "num_tokens": 125866351.0, + "step": 3453 + }, + { + "epoch": 0.6414113277623027, + "grad_norm": 1.5787237882614136, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.863235354423523, + "num_tokens": 125899903.0, + "step": 3454 + }, + { + "epoch": 0.6415970287836583, + "grad_norm": 1.4534976482391357, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8618168830871582, + "num_tokens": 125938428.0, + "step": 3455 + }, + { + "epoch": 0.6417827298050139, + "grad_norm": 1.4418809413909912, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8506456613540649, + "num_tokens": 125981828.0, + "step": 3456 + }, + { + "epoch": 0.6419684308263696, + "grad_norm": 1.524601936340332, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.868401825428009, + "num_tokens": 126018709.0, + "step": 3457 + }, + { + "epoch": 0.6421541318477252, + "grad_norm": 1.6442323923110962, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8702896237373352, + "num_tokens": 126051480.0, + "step": 3458 + }, + { + "epoch": 0.6423398328690808, + "grad_norm": 1.7058440446853638, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8672817945480347, + "num_tokens": 126080835.0, + "step": 3459 + }, + { + "epoch": 0.6425255338904364, + "grad_norm": 1.4883009195327759, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8759689331054688, + "num_tokens": 126117731.0, + "step": 3460 + }, + { + "epoch": 0.642711234911792, + "grad_norm": 1.4968446493148804, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8716719746589661, + "num_tokens": 126157766.0, + "step": 3461 + }, + { + "epoch": 0.6428969359331477, + "grad_norm": 1.5537943840026855, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8762792348861694, + "num_tokens": 126191568.0, + "step": 3462 + }, + { + "epoch": 0.6430826369545033, + "grad_norm": 1.6828632354736328, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8740148544311523, + "num_tokens": 126219581.0, + "step": 3463 + }, + { + "epoch": 0.6432683379758589, + "grad_norm": 1.4171148538589478, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8775814175605774, + "num_tokens": 126260192.0, + "step": 3464 + }, + { + "epoch": 0.6434540389972145, + "grad_norm": 1.52595853805542, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8758500814437866, + "num_tokens": 126299458.0, + "step": 3465 + }, + { + "epoch": 0.64363974001857, + "grad_norm": 1.532192349433899, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8607661724090576, + "num_tokens": 126338390.0, + "step": 3466 + }, + { + "epoch": 0.6438254410399257, + "grad_norm": 1.7227301597595215, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8456772565841675, + "num_tokens": 126376161.0, + "step": 3467 + }, + { + "epoch": 0.6440111420612813, + "grad_norm": 1.4314427375793457, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8738914728164673, + "num_tokens": 126415614.0, + "step": 3468 + }, + { + "epoch": 0.6441968430826369, + "grad_norm": 1.8199745416641235, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8471424579620361, + "num_tokens": 126453618.0, + "step": 3469 + }, + { + "epoch": 0.6443825441039925, + "grad_norm": 1.5474382638931274, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8764927387237549, + "num_tokens": 126485282.0, + "step": 3470 + }, + { + "epoch": 0.6445682451253482, + "grad_norm": 1.51120126247406, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8771684765815735, + "num_tokens": 126518519.0, + "step": 3471 + }, + { + "epoch": 0.6447539461467038, + "grad_norm": 1.6563441753387451, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8789640665054321, + "num_tokens": 126547596.0, + "step": 3472 + }, + { + "epoch": 0.6449396471680594, + "grad_norm": 1.5566686391830444, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8747541308403015, + "num_tokens": 126583271.0, + "step": 3473 + }, + { + "epoch": 0.645125348189415, + "grad_norm": 1.529222011566162, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8607741594314575, + "num_tokens": 126622744.0, + "step": 3474 + }, + { + "epoch": 0.6453110492107706, + "grad_norm": 1.4985102415084839, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8720434904098511, + "num_tokens": 126657192.0, + "step": 3475 + }, + { + "epoch": 0.6454967502321263, + "grad_norm": 1.5372759103775024, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8602455258369446, + "num_tokens": 126696231.0, + "step": 3476 + }, + { + "epoch": 0.6456824512534819, + "grad_norm": 1.4554717540740967, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8684102892875671, + "num_tokens": 126736721.0, + "step": 3477 + }, + { + "epoch": 0.6458681522748375, + "grad_norm": 1.5003701448440552, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8707605004310608, + "num_tokens": 126776732.0, + "step": 3478 + }, + { + "epoch": 0.6460538532961931, + "grad_norm": 1.5612354278564453, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8578759431838989, + "num_tokens": 126814666.0, + "step": 3479 + }, + { + "epoch": 0.6462395543175488, + "grad_norm": 1.596898078918457, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8557155728340149, + "num_tokens": 126850155.0, + "step": 3480 + }, + { + "epoch": 0.6464252553389044, + "grad_norm": 1.5185471773147583, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8573343753814697, + "num_tokens": 126887071.0, + "step": 3481 + }, + { + "epoch": 0.64661095636026, + "grad_norm": 1.4790573120117188, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8723771572113037, + "num_tokens": 126925315.0, + "step": 3482 + }, + { + "epoch": 0.6467966573816156, + "grad_norm": 1.472366452217102, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8640455007553101, + "num_tokens": 126964261.0, + "step": 3483 + }, + { + "epoch": 0.6469823584029712, + "grad_norm": 1.5734987258911133, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.859703004360199, + "num_tokens": 127003502.0, + "step": 3484 + }, + { + "epoch": 0.6471680594243269, + "grad_norm": 1.5478706359863281, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8653672933578491, + "num_tokens": 127042032.0, + "step": 3485 + }, + { + "epoch": 0.6473537604456825, + "grad_norm": 1.4666597843170166, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8755567073822021, + "num_tokens": 127080567.0, + "step": 3486 + }, + { + "epoch": 0.6475394614670381, + "grad_norm": 1.5567824840545654, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8569463491439819, + "num_tokens": 127117345.0, + "step": 3487 + }, + { + "epoch": 0.6477251624883937, + "grad_norm": 1.485245704650879, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8811618089675903, + "num_tokens": 127153063.0, + "step": 3488 + }, + { + "epoch": 0.6479108635097494, + "grad_norm": 1.529396414756775, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8739962577819824, + "num_tokens": 127185352.0, + "step": 3489 + }, + { + "epoch": 0.6480965645311049, + "grad_norm": 1.6381580829620361, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8765931725502014, + "num_tokens": 127217997.0, + "step": 3490 + }, + { + "epoch": 0.6482822655524605, + "grad_norm": 1.441453218460083, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.869621217250824, + "num_tokens": 127255027.0, + "step": 3491 + }, + { + "epoch": 0.6484679665738161, + "grad_norm": 1.4685691595077515, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8705887794494629, + "num_tokens": 127293230.0, + "step": 3492 + }, + { + "epoch": 0.6486536675951717, + "grad_norm": 1.3908679485321045, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8618452548980713, + "num_tokens": 127335058.0, + "step": 3493 + }, + { + "epoch": 0.6488393686165274, + "grad_norm": 1.6003934144973755, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8640473484992981, + "num_tokens": 127369557.0, + "step": 3494 + }, + { + "epoch": 0.649025069637883, + "grad_norm": 1.4847607612609863, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8710583448410034, + "num_tokens": 127405375.0, + "step": 3495 + }, + { + "epoch": 0.6492107706592386, + "grad_norm": 1.5213426351547241, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8603752851486206, + "num_tokens": 127444427.0, + "step": 3496 + }, + { + "epoch": 0.6493964716805942, + "grad_norm": 1.5985029935836792, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.862769365310669, + "num_tokens": 127477785.0, + "step": 3497 + }, + { + "epoch": 0.6495821727019498, + "grad_norm": 1.589833378791809, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8685061931610107, + "num_tokens": 127509777.0, + "step": 3498 + }, + { + "epoch": 0.6497678737233055, + "grad_norm": 1.662032127380371, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8592860102653503, + "num_tokens": 127547167.0, + "step": 3499 + }, + { + "epoch": 0.6499535747446611, + "grad_norm": 1.578939437866211, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8755224943161011, + "num_tokens": 127580338.0, + "step": 3500 + }, + { + "epoch": 0.6501392757660167, + "grad_norm": 1.5776737928390503, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8795652389526367, + "num_tokens": 127611096.0, + "step": 3501 + }, + { + "epoch": 0.6503249767873723, + "grad_norm": 1.518518090248108, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8633169531822205, + "num_tokens": 127647817.0, + "step": 3502 + }, + { + "epoch": 0.650510677808728, + "grad_norm": 1.5311623811721802, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8498379588127136, + "num_tokens": 127687242.0, + "step": 3503 + }, + { + "epoch": 0.6506963788300836, + "grad_norm": 1.4347234964370728, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8540765047073364, + "num_tokens": 127728994.0, + "step": 3504 + }, + { + "epoch": 0.6508820798514392, + "grad_norm": 1.3782505989074707, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8592253923416138, + "num_tokens": 127775292.0, + "step": 3505 + }, + { + "epoch": 0.6510677808727948, + "grad_norm": 1.5543806552886963, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8644630908966064, + "num_tokens": 127810083.0, + "step": 3506 + }, + { + "epoch": 0.6512534818941504, + "grad_norm": 1.6180437803268433, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8540125489234924, + "num_tokens": 127845098.0, + "step": 3507 + }, + { + "epoch": 0.6514391829155061, + "grad_norm": 1.5031296014785767, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8876535892486572, + "num_tokens": 127878741.0, + "step": 3508 + }, + { + "epoch": 0.6516248839368617, + "grad_norm": 1.495770812034607, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8627381920814514, + "num_tokens": 127915075.0, + "step": 3509 + }, + { + "epoch": 0.6518105849582173, + "grad_norm": 1.4939472675323486, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8608331680297852, + "num_tokens": 127956682.0, + "step": 3510 + }, + { + "epoch": 0.6519962859795729, + "grad_norm": 1.642989158630371, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8618648648262024, + "num_tokens": 127989796.0, + "step": 3511 + }, + { + "epoch": 0.6521819870009286, + "grad_norm": 1.5573002099990845, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8505861163139343, + "num_tokens": 128025375.0, + "step": 3512 + }, + { + "epoch": 0.6523676880222842, + "grad_norm": 1.4827628135681152, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8743853569030762, + "num_tokens": 128059326.0, + "step": 3513 + }, + { + "epoch": 0.6525533890436397, + "grad_norm": 1.5754358768463135, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8665364384651184, + "num_tokens": 128093322.0, + "step": 3514 + }, + { + "epoch": 0.6527390900649953, + "grad_norm": 1.4972021579742432, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8726925849914551, + "num_tokens": 128129889.0, + "step": 3515 + }, + { + "epoch": 0.6529247910863509, + "grad_norm": 1.4429172277450562, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8672752380371094, + "num_tokens": 128168467.0, + "step": 3516 + }, + { + "epoch": 0.6531104921077066, + "grad_norm": 1.5396782159805298, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8508288264274597, + "num_tokens": 128207418.0, + "step": 3517 + }, + { + "epoch": 0.6532961931290622, + "grad_norm": 1.5606249570846558, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8711727857589722, + "num_tokens": 128241400.0, + "step": 3518 + }, + { + "epoch": 0.6534818941504178, + "grad_norm": 1.6360794305801392, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8601403832435608, + "num_tokens": 128273877.0, + "step": 3519 + }, + { + "epoch": 0.6536675951717734, + "grad_norm": 1.429955005645752, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8671141266822815, + "num_tokens": 128313052.0, + "step": 3520 + }, + { + "epoch": 0.653853296193129, + "grad_norm": 1.5642598867416382, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.874414324760437, + "num_tokens": 128346150.0, + "step": 3521 + }, + { + "epoch": 0.6540389972144847, + "grad_norm": 1.4896841049194336, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8640846610069275, + "num_tokens": 128386949.0, + "step": 3522 + }, + { + "epoch": 0.6542246982358403, + "grad_norm": 1.6481225490570068, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8760514259338379, + "num_tokens": 128419792.0, + "step": 3523 + }, + { + "epoch": 0.6544103992571959, + "grad_norm": 1.4038114547729492, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8844850063323975, + "num_tokens": 128457947.0, + "step": 3524 + }, + { + "epoch": 0.6545961002785515, + "grad_norm": 1.4829305410385132, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8817489147186279, + "num_tokens": 128494849.0, + "step": 3525 + }, + { + "epoch": 0.6547818012999072, + "grad_norm": 1.5614659786224365, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.856288492679596, + "num_tokens": 128534251.0, + "step": 3526 + }, + { + "epoch": 0.6549675023212628, + "grad_norm": 1.5568349361419678, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.874335765838623, + "num_tokens": 128568099.0, + "step": 3527 + }, + { + "epoch": 0.6551532033426184, + "grad_norm": 1.5159449577331543, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8723049759864807, + "num_tokens": 128604343.0, + "step": 3528 + }, + { + "epoch": 0.655338904363974, + "grad_norm": 1.492767333984375, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8663424849510193, + "num_tokens": 128644869.0, + "step": 3529 + }, + { + "epoch": 0.6555246053853296, + "grad_norm": 1.6033083200454712, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8635098338127136, + "num_tokens": 128676285.0, + "step": 3530 + }, + { + "epoch": 0.6557103064066853, + "grad_norm": 1.5445985794067383, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8585478663444519, + "num_tokens": 128712835.0, + "step": 3531 + }, + { + "epoch": 0.6558960074280409, + "grad_norm": 1.5919787883758545, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8652535676956177, + "num_tokens": 128747136.0, + "step": 3532 + }, + { + "epoch": 0.6560817084493965, + "grad_norm": 1.546260118484497, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8499096632003784, + "num_tokens": 128786078.0, + "step": 3533 + }, + { + "epoch": 0.6562674094707521, + "grad_norm": 1.5808569192886353, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8612306118011475, + "num_tokens": 128819708.0, + "step": 3534 + }, + { + "epoch": 0.6564531104921077, + "grad_norm": 1.6093519926071167, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8613361120223999, + "num_tokens": 128853452.0, + "step": 3535 + }, + { + "epoch": 0.6566388115134634, + "grad_norm": 1.7013399600982666, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8703634142875671, + "num_tokens": 128884942.0, + "step": 3536 + }, + { + "epoch": 0.656824512534819, + "grad_norm": 1.4749032258987427, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8686301708221436, + "num_tokens": 128923937.0, + "step": 3537 + }, + { + "epoch": 0.6570102135561745, + "grad_norm": 1.6293600797653198, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8433690071105957, + "num_tokens": 128958921.0, + "step": 3538 + }, + { + "epoch": 0.6571959145775301, + "grad_norm": 1.4071542024612427, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8646591901779175, + "num_tokens": 128999986.0, + "step": 3539 + }, + { + "epoch": 0.6573816155988857, + "grad_norm": 1.5979915857315063, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8680385947227478, + "num_tokens": 129032837.0, + "step": 3540 + }, + { + "epoch": 0.6575673166202414, + "grad_norm": 1.5046300888061523, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8740371465682983, + "num_tokens": 129075010.0, + "step": 3541 + }, + { + "epoch": 0.657753017641597, + "grad_norm": 1.5412373542785645, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.877240777015686, + "num_tokens": 129110420.0, + "step": 3542 + }, + { + "epoch": 0.6579387186629526, + "grad_norm": 1.6285568475723267, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8459765911102295, + "num_tokens": 129147158.0, + "step": 3543 + }, + { + "epoch": 0.6581244196843082, + "grad_norm": 1.470805048942566, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8814353942871094, + "num_tokens": 129181141.0, + "step": 3544 + }, + { + "epoch": 0.6583101207056639, + "grad_norm": 1.549539566040039, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.880291223526001, + "num_tokens": 129212183.0, + "step": 3545 + }, + { + "epoch": 0.6584958217270195, + "grad_norm": 1.7430009841918945, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8626542091369629, + "num_tokens": 129240130.0, + "step": 3546 + }, + { + "epoch": 0.6586815227483751, + "grad_norm": 1.549743413925171, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8589964509010315, + "num_tokens": 129276733.0, + "step": 3547 + }, + { + "epoch": 0.6588672237697307, + "grad_norm": 1.4982588291168213, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8657192587852478, + "num_tokens": 129315561.0, + "step": 3548 + }, + { + "epoch": 0.6590529247910863, + "grad_norm": 1.3691762685775757, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8712801933288574, + "num_tokens": 129359088.0, + "step": 3549 + }, + { + "epoch": 0.659238625812442, + "grad_norm": 1.5902820825576782, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8759138584136963, + "num_tokens": 129392104.0, + "step": 3550 + }, + { + "epoch": 0.6594243268337976, + "grad_norm": 1.7643847465515137, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8567095994949341, + "num_tokens": 129421644.0, + "step": 3551 + }, + { + "epoch": 0.6596100278551532, + "grad_norm": 1.6045989990234375, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8641752600669861, + "num_tokens": 129459742.0, + "step": 3552 + }, + { + "epoch": 0.6597957288765088, + "grad_norm": 1.6205743551254272, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.866376519203186, + "num_tokens": 129491757.0, + "step": 3553 + }, + { + "epoch": 0.6599814298978645, + "grad_norm": 1.4314149618148804, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8781938552856445, + "num_tokens": 129530698.0, + "step": 3554 + }, + { + "epoch": 0.6601671309192201, + "grad_norm": 1.6433244943618774, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8623226284980774, + "num_tokens": 129565498.0, + "step": 3555 + }, + { + "epoch": 0.6603528319405757, + "grad_norm": 1.548231601715088, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8465379476547241, + "num_tokens": 129604627.0, + "step": 3556 + }, + { + "epoch": 0.6605385329619313, + "grad_norm": 1.5417132377624512, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.857648491859436, + "num_tokens": 129642604.0, + "step": 3557 + }, + { + "epoch": 0.660724233983287, + "grad_norm": 1.5044820308685303, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8657496571540833, + "num_tokens": 129677504.0, + "step": 3558 + }, + { + "epoch": 0.6609099350046426, + "grad_norm": 1.4861170053482056, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8627204298973083, + "num_tokens": 129716703.0, + "step": 3559 + }, + { + "epoch": 0.6610956360259982, + "grad_norm": 1.5852811336517334, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8456450700759888, + "num_tokens": 129755896.0, + "step": 3560 + }, + { + "epoch": 0.6612813370473538, + "grad_norm": 1.653814435005188, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8610742092132568, + "num_tokens": 129788020.0, + "step": 3561 + }, + { + "epoch": 0.6614670380687093, + "grad_norm": 1.541337251663208, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8518736362457275, + "num_tokens": 129824891.0, + "step": 3562 + }, + { + "epoch": 0.661652739090065, + "grad_norm": 1.540764570236206, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8534461259841919, + "num_tokens": 129859686.0, + "step": 3563 + }, + { + "epoch": 0.6618384401114206, + "grad_norm": 1.6044342517852783, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8676558136940002, + "num_tokens": 129892189.0, + "step": 3564 + }, + { + "epoch": 0.6620241411327762, + "grad_norm": 1.581939697265625, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8760811686515808, + "num_tokens": 129925577.0, + "step": 3565 + }, + { + "epoch": 0.6622098421541318, + "grad_norm": 1.4207872152328491, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8692302107810974, + "num_tokens": 129967865.0, + "step": 3566 + }, + { + "epoch": 0.6623955431754874, + "grad_norm": 2.040072441101074, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8602825999259949, + "num_tokens": 130001152.0, + "step": 3567 + }, + { + "epoch": 0.662581244196843, + "grad_norm": 1.5615718364715576, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8685438632965088, + "num_tokens": 130038101.0, + "step": 3568 + }, + { + "epoch": 0.6627669452181987, + "grad_norm": 1.5373727083206177, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8627321720123291, + "num_tokens": 130073985.0, + "step": 3569 + }, + { + "epoch": 0.6629526462395543, + "grad_norm": 1.469835638999939, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.877922534942627, + "num_tokens": 130110673.0, + "step": 3570 + }, + { + "epoch": 0.6631383472609099, + "grad_norm": 1.5851508378982544, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8721593022346497, + "num_tokens": 130143844.0, + "step": 3571 + }, + { + "epoch": 0.6633240482822655, + "grad_norm": 1.638346791267395, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8576087951660156, + "num_tokens": 130179349.0, + "step": 3572 + }, + { + "epoch": 0.6635097493036212, + "grad_norm": 1.4939327239990234, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8671497702598572, + "num_tokens": 130216565.0, + "step": 3573 + }, + { + "epoch": 0.6636954503249768, + "grad_norm": 1.4688538312911987, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.864385187625885, + "num_tokens": 130258874.0, + "step": 3574 + }, + { + "epoch": 0.6638811513463324, + "grad_norm": 1.5003993511199951, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8621512651443481, + "num_tokens": 130296866.0, + "step": 3575 + }, + { + "epoch": 0.664066852367688, + "grad_norm": 1.5274658203125, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8689877986907959, + "num_tokens": 130332574.0, + "step": 3576 + }, + { + "epoch": 0.6642525533890437, + "grad_norm": 1.510911464691162, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8656433820724487, + "num_tokens": 130372178.0, + "step": 3577 + }, + { + "epoch": 0.6644382544103993, + "grad_norm": 1.5811690092086792, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8586101531982422, + "num_tokens": 130407542.0, + "step": 3578 + }, + { + "epoch": 0.6646239554317549, + "grad_norm": 1.3353638648986816, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8735847473144531, + "num_tokens": 130453904.0, + "step": 3579 + }, + { + "epoch": 0.6648096564531105, + "grad_norm": 1.4482098817825317, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8753281831741333, + "num_tokens": 130491848.0, + "step": 3580 + }, + { + "epoch": 0.6649953574744661, + "grad_norm": 1.530303716659546, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8545657396316528, + "num_tokens": 130530189.0, + "step": 3581 + }, + { + "epoch": 0.6651810584958218, + "grad_norm": 1.481988787651062, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8617087602615356, + "num_tokens": 130570536.0, + "step": 3582 + }, + { + "epoch": 0.6653667595171774, + "grad_norm": 1.6197832822799683, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8703923225402832, + "num_tokens": 130603503.0, + "step": 3583 + }, + { + "epoch": 0.665552460538533, + "grad_norm": 1.647364854812622, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8575082421302795, + "num_tokens": 130641414.0, + "step": 3584 + }, + { + "epoch": 0.6657381615598886, + "grad_norm": 1.6006888151168823, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8673146963119507, + "num_tokens": 130675306.0, + "step": 3585 + }, + { + "epoch": 0.6659238625812441, + "grad_norm": 1.5601928234100342, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.868030309677124, + "num_tokens": 130711855.0, + "step": 3586 + }, + { + "epoch": 0.6661095636025998, + "grad_norm": 1.5797944068908691, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8717708587646484, + "num_tokens": 130745511.0, + "step": 3587 + }, + { + "epoch": 0.6662952646239554, + "grad_norm": 1.4665637016296387, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8836503028869629, + "num_tokens": 130781583.0, + "step": 3588 + }, + { + "epoch": 0.666480965645311, + "grad_norm": 1.489782691001892, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8586472868919373, + "num_tokens": 130819112.0, + "step": 3589 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.4651687145233154, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8809615969657898, + "num_tokens": 130854792.0, + "step": 3590 + }, + { + "epoch": 0.6668523676880223, + "grad_norm": 1.6555877923965454, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8626259565353394, + "num_tokens": 130890191.0, + "step": 3591 + }, + { + "epoch": 0.6670380687093779, + "grad_norm": 1.5851483345031738, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8578649759292603, + "num_tokens": 130924760.0, + "step": 3592 + }, + { + "epoch": 0.6672237697307335, + "grad_norm": 1.647447943687439, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8630296587944031, + "num_tokens": 130957318.0, + "step": 3593 + }, + { + "epoch": 0.6674094707520891, + "grad_norm": 1.643794059753418, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8755331039428711, + "num_tokens": 130985834.0, + "step": 3594 + }, + { + "epoch": 0.6675951717734447, + "grad_norm": 1.4439812898635864, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8723598718643188, + "num_tokens": 131026869.0, + "step": 3595 + }, + { + "epoch": 0.6677808727948004, + "grad_norm": 1.5476583242416382, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8780967593193054, + "num_tokens": 131060298.0, + "step": 3596 + }, + { + "epoch": 0.667966573816156, + "grad_norm": 1.4617124795913696, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.856722354888916, + "num_tokens": 131101866.0, + "step": 3597 + }, + { + "epoch": 0.6681522748375116, + "grad_norm": 1.5203279256820679, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.868242084980011, + "num_tokens": 131137900.0, + "step": 3598 + }, + { + "epoch": 0.6683379758588672, + "grad_norm": 1.6978238821029663, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8671003580093384, + "num_tokens": 131173578.0, + "step": 3599 + }, + { + "epoch": 0.6685236768802229, + "grad_norm": 1.509474277496338, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8636170625686646, + "num_tokens": 131210713.0, + "step": 3600 + }, + { + "epoch": 0.6687093779015785, + "grad_norm": 1.4879271984100342, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8717021942138672, + "num_tokens": 131247423.0, + "step": 3601 + }, + { + "epoch": 0.6688950789229341, + "grad_norm": 1.4662407636642456, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8767200708389282, + "num_tokens": 131283146.0, + "step": 3602 + }, + { + "epoch": 0.6690807799442897, + "grad_norm": 1.5800164937973022, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8629586696624756, + "num_tokens": 131318337.0, + "step": 3603 + }, + { + "epoch": 0.6692664809656453, + "grad_norm": 1.4895739555358887, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8740049600601196, + "num_tokens": 131354875.0, + "step": 3604 + }, + { + "epoch": 0.669452181987001, + "grad_norm": 1.672701358795166, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.870258629322052, + "num_tokens": 131392812.0, + "step": 3605 + }, + { + "epoch": 0.6696378830083566, + "grad_norm": 1.5004220008850098, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8623468279838562, + "num_tokens": 131431786.0, + "step": 3606 + }, + { + "epoch": 0.6698235840297122, + "grad_norm": 1.4198391437530518, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8754353523254395, + "num_tokens": 131471474.0, + "step": 3607 + }, + { + "epoch": 0.6700092850510678, + "grad_norm": 1.4960676431655884, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8542433977127075, + "num_tokens": 131513872.0, + "step": 3608 + }, + { + "epoch": 0.6701949860724234, + "grad_norm": 1.4872925281524658, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8566849231719971, + "num_tokens": 131552358.0, + "step": 3609 + }, + { + "epoch": 0.6703806870937791, + "grad_norm": 1.8320637941360474, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8565571308135986, + "num_tokens": 131579851.0, + "step": 3610 + }, + { + "epoch": 0.6705663881151346, + "grad_norm": 1.5108251571655273, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8822590708732605, + "num_tokens": 131614370.0, + "step": 3611 + }, + { + "epoch": 0.6707520891364902, + "grad_norm": 1.5230995416641235, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8595955967903137, + "num_tokens": 131654689.0, + "step": 3612 + }, + { + "epoch": 0.6709377901578458, + "grad_norm": 1.5397231578826904, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8533596992492676, + "num_tokens": 131691246.0, + "step": 3613 + }, + { + "epoch": 0.6711234911792014, + "grad_norm": 1.7118041515350342, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8724594116210938, + "num_tokens": 131729148.0, + "step": 3614 + }, + { + "epoch": 0.6713091922005571, + "grad_norm": 1.6665523052215576, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.85639488697052, + "num_tokens": 131765736.0, + "step": 3615 + }, + { + "epoch": 0.6714948932219127, + "grad_norm": 1.486554503440857, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8628475069999695, + "num_tokens": 131805766.0, + "step": 3616 + }, + { + "epoch": 0.6716805942432683, + "grad_norm": 1.6809873580932617, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8644453287124634, + "num_tokens": 131838519.0, + "step": 3617 + }, + { + "epoch": 0.6718662952646239, + "grad_norm": 1.5990382432937622, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8600565791130066, + "num_tokens": 131876445.0, + "step": 3618 + }, + { + "epoch": 0.6720519962859796, + "grad_norm": 1.5674502849578857, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8580479621887207, + "num_tokens": 131912644.0, + "step": 3619 + }, + { + "epoch": 0.6722376973073352, + "grad_norm": 1.537986397743225, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8736357688903809, + "num_tokens": 131949502.0, + "step": 3620 + }, + { + "epoch": 0.6724233983286908, + "grad_norm": 1.7020446062088013, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.871265172958374, + "num_tokens": 131981508.0, + "step": 3621 + }, + { + "epoch": 0.6726090993500464, + "grad_norm": 1.439034104347229, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8623802661895752, + "num_tokens": 132021544.0, + "step": 3622 + }, + { + "epoch": 0.672794800371402, + "grad_norm": 1.537822961807251, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8607611656188965, + "num_tokens": 132059316.0, + "step": 3623 + }, + { + "epoch": 0.6729805013927577, + "grad_norm": 1.4841400384902954, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8583357930183411, + "num_tokens": 132098607.0, + "step": 3624 + }, + { + "epoch": 0.6731662024141133, + "grad_norm": 1.4427388906478882, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8748812675476074, + "num_tokens": 132137029.0, + "step": 3625 + }, + { + "epoch": 0.6733519034354689, + "grad_norm": 1.4883220195770264, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8758116364479065, + "num_tokens": 132171833.0, + "step": 3626 + }, + { + "epoch": 0.6735376044568245, + "grad_norm": 1.5698665380477905, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8640365600585938, + "num_tokens": 132206324.0, + "step": 3627 + }, + { + "epoch": 0.6737233054781802, + "grad_norm": 1.372302770614624, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8712209463119507, + "num_tokens": 132249114.0, + "step": 3628 + }, + { + "epoch": 0.6739090064995358, + "grad_norm": 1.4908089637756348, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8639391660690308, + "num_tokens": 132285914.0, + "step": 3629 + }, + { + "epoch": 0.6740947075208914, + "grad_norm": 1.5256588459014893, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.857151985168457, + "num_tokens": 132322725.0, + "step": 3630 + }, + { + "epoch": 0.674280408542247, + "grad_norm": 1.5871658325195312, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8514115810394287, + "num_tokens": 132363544.0, + "step": 3631 + }, + { + "epoch": 0.6744661095636026, + "grad_norm": 1.633810043334961, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8508023023605347, + "num_tokens": 132397399.0, + "step": 3632 + }, + { + "epoch": 0.6746518105849583, + "grad_norm": 1.6513890027999878, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8530114889144897, + "num_tokens": 132430503.0, + "step": 3633 + }, + { + "epoch": 0.6748375116063139, + "grad_norm": 1.5314894914627075, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8544990420341492, + "num_tokens": 132466967.0, + "step": 3634 + }, + { + "epoch": 0.6750232126276694, + "grad_norm": 1.4383056163787842, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8688737154006958, + "num_tokens": 132503491.0, + "step": 3635 + }, + { + "epoch": 0.675208913649025, + "grad_norm": 1.8113951683044434, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8602567315101624, + "num_tokens": 132533808.0, + "step": 3636 + }, + { + "epoch": 0.6753946146703806, + "grad_norm": 1.6197675466537476, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8513462543487549, + "num_tokens": 132568755.0, + "step": 3637 + }, + { + "epoch": 0.6755803156917363, + "grad_norm": 1.5325582027435303, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8727502226829529, + "num_tokens": 132605805.0, + "step": 3638 + }, + { + "epoch": 0.6757660167130919, + "grad_norm": 1.4346587657928467, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8490453958511353, + "num_tokens": 132653592.0, + "step": 3639 + }, + { + "epoch": 0.6759517177344475, + "grad_norm": 1.5285528898239136, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8548946976661682, + "num_tokens": 132694743.0, + "step": 3640 + }, + { + "epoch": 0.6761374187558031, + "grad_norm": 1.532366156578064, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.857341468334198, + "num_tokens": 132731821.0, + "step": 3641 + }, + { + "epoch": 0.6763231197771588, + "grad_norm": 1.4609354734420776, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8640316724777222, + "num_tokens": 132770772.0, + "step": 3642 + }, + { + "epoch": 0.6765088207985144, + "grad_norm": 1.4038007259368896, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8544902205467224, + "num_tokens": 132815276.0, + "step": 3643 + }, + { + "epoch": 0.67669452181987, + "grad_norm": 1.4870370626449585, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8553107976913452, + "num_tokens": 132858679.0, + "step": 3644 + }, + { + "epoch": 0.6768802228412256, + "grad_norm": 1.4002447128295898, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8826260566711426, + "num_tokens": 132901242.0, + "step": 3645 + }, + { + "epoch": 0.6770659238625812, + "grad_norm": 1.5662628412246704, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.856498658657074, + "num_tokens": 132935789.0, + "step": 3646 + }, + { + "epoch": 0.6772516248839369, + "grad_norm": 1.3575125932693481, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8581836223602295, + "num_tokens": 132980932.0, + "step": 3647 + }, + { + "epoch": 0.6774373259052925, + "grad_norm": 1.505725622177124, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8748352527618408, + "num_tokens": 133016320.0, + "step": 3648 + }, + { + "epoch": 0.6776230269266481, + "grad_norm": 1.3978126049041748, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8605808615684509, + "num_tokens": 133058835.0, + "step": 3649 + }, + { + "epoch": 0.6778087279480037, + "grad_norm": 1.4807732105255127, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8577039241790771, + "num_tokens": 133098725.0, + "step": 3650 + }, + { + "epoch": 0.6779944289693594, + "grad_norm": 1.5283821821212769, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8726662993431091, + "num_tokens": 133139093.0, + "step": 3651 + }, + { + "epoch": 0.678180129990715, + "grad_norm": 1.5154386758804321, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8857313990592957, + "num_tokens": 133171971.0, + "step": 3652 + }, + { + "epoch": 0.6783658310120706, + "grad_norm": 1.581809639930725, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8743649125099182, + "num_tokens": 133211120.0, + "step": 3653 + }, + { + "epoch": 0.6785515320334262, + "grad_norm": 1.4470794200897217, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8610801100730896, + "num_tokens": 133251354.0, + "step": 3654 + }, + { + "epoch": 0.6787372330547818, + "grad_norm": 1.4802786111831665, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8737682104110718, + "num_tokens": 133286318.0, + "step": 3655 + }, + { + "epoch": 0.6789229340761375, + "grad_norm": 1.7462321519851685, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.864458441734314, + "num_tokens": 133317222.0, + "step": 3656 + }, + { + "epoch": 0.6791086350974931, + "grad_norm": 1.5225303173065186, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8775213956832886, + "num_tokens": 133353084.0, + "step": 3657 + }, + { + "epoch": 0.6792943361188487, + "grad_norm": 1.5913087129592896, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8678655624389648, + "num_tokens": 133384074.0, + "step": 3658 + }, + { + "epoch": 0.6794800371402042, + "grad_norm": 1.4799729585647583, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8735881447792053, + "num_tokens": 133422692.0, + "step": 3659 + }, + { + "epoch": 0.6796657381615598, + "grad_norm": 1.5297340154647827, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8631551861763, + "num_tokens": 133459763.0, + "step": 3660 + }, + { + "epoch": 0.6798514391829155, + "grad_norm": 1.6965947151184082, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8513258099555969, + "num_tokens": 133491850.0, + "step": 3661 + }, + { + "epoch": 0.6800371402042711, + "grad_norm": 1.5499628782272339, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8594686985015869, + "num_tokens": 133528804.0, + "step": 3662 + }, + { + "epoch": 0.6802228412256267, + "grad_norm": 1.627995252609253, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8552975654602051, + "num_tokens": 133563405.0, + "step": 3663 + }, + { + "epoch": 0.6804085422469823, + "grad_norm": 1.484265923500061, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8761782646179199, + "num_tokens": 133598042.0, + "step": 3664 + }, + { + "epoch": 0.680594243268338, + "grad_norm": 1.6754577159881592, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8639839887619019, + "num_tokens": 133630289.0, + "step": 3665 + }, + { + "epoch": 0.6807799442896936, + "grad_norm": 1.5243269205093384, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8647776246070862, + "num_tokens": 133670092.0, + "step": 3666 + }, + { + "epoch": 0.6809656453110492, + "grad_norm": 1.5163789987564087, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8666732311248779, + "num_tokens": 133705198.0, + "step": 3667 + }, + { + "epoch": 0.6811513463324048, + "grad_norm": 1.6119649410247803, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8678403496742249, + "num_tokens": 133740478.0, + "step": 3668 + }, + { + "epoch": 0.6813370473537604, + "grad_norm": 1.661653995513916, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8641873598098755, + "num_tokens": 133773600.0, + "step": 3669 + }, + { + "epoch": 0.6815227483751161, + "grad_norm": 1.5075260400772095, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.859533429145813, + "num_tokens": 133812756.0, + "step": 3670 + }, + { + "epoch": 0.6817084493964717, + "grad_norm": 1.3930537700653076, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8835999965667725, + "num_tokens": 133852595.0, + "step": 3671 + }, + { + "epoch": 0.6818941504178273, + "grad_norm": 1.479569911956787, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8714852333068848, + "num_tokens": 133890750.0, + "step": 3672 + }, + { + "epoch": 0.6820798514391829, + "grad_norm": 1.5794767141342163, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.862730860710144, + "num_tokens": 133926143.0, + "step": 3673 + }, + { + "epoch": 0.6822655524605385, + "grad_norm": 1.4676578044891357, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8787656426429749, + "num_tokens": 133961264.0, + "step": 3674 + }, + { + "epoch": 0.6824512534818942, + "grad_norm": 1.597304105758667, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.864075779914856, + "num_tokens": 133997131.0, + "step": 3675 + }, + { + "epoch": 0.6826369545032498, + "grad_norm": 1.4215288162231445, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8680030107498169, + "num_tokens": 134039002.0, + "step": 3676 + }, + { + "epoch": 0.6828226555246054, + "grad_norm": 1.5660432577133179, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8442972302436829, + "num_tokens": 134078151.0, + "step": 3677 + }, + { + "epoch": 0.683008356545961, + "grad_norm": 1.5932337045669556, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8641006946563721, + "num_tokens": 134110481.0, + "step": 3678 + }, + { + "epoch": 0.6831940575673167, + "grad_norm": 1.47759211063385, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.864594578742981, + "num_tokens": 134150109.0, + "step": 3679 + }, + { + "epoch": 0.6833797585886723, + "grad_norm": 1.6414144039154053, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8686192035675049, + "num_tokens": 134187415.0, + "step": 3680 + }, + { + "epoch": 0.6835654596100279, + "grad_norm": 1.4147216081619263, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8752763271331787, + "num_tokens": 134229830.0, + "step": 3681 + }, + { + "epoch": 0.6837511606313835, + "grad_norm": 1.6866083145141602, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.84811931848526, + "num_tokens": 134263690.0, + "step": 3682 + }, + { + "epoch": 0.683936861652739, + "grad_norm": 1.4651727676391602, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.883985161781311, + "num_tokens": 134299270.0, + "step": 3683 + }, + { + "epoch": 0.6841225626740947, + "grad_norm": 1.5955902338027954, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8597695231437683, + "num_tokens": 134335281.0, + "step": 3684 + }, + { + "epoch": 0.6843082636954503, + "grad_norm": 1.6048164367675781, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8652902245521545, + "num_tokens": 134370232.0, + "step": 3685 + }, + { + "epoch": 0.6844939647168059, + "grad_norm": 1.6283243894577026, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8701039552688599, + "num_tokens": 134401119.0, + "step": 3686 + }, + { + "epoch": 0.6846796657381615, + "grad_norm": 1.5935865640640259, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8584082126617432, + "num_tokens": 134436214.0, + "step": 3687 + }, + { + "epoch": 0.6848653667595171, + "grad_norm": 1.5270419120788574, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.872140645980835, + "num_tokens": 134470821.0, + "step": 3688 + }, + { + "epoch": 0.6850510677808728, + "grad_norm": 1.4783082008361816, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.858752965927124, + "num_tokens": 134511761.0, + "step": 3689 + }, + { + "epoch": 0.6852367688022284, + "grad_norm": 1.580652117729187, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8634860515594482, + "num_tokens": 134545506.0, + "step": 3690 + }, + { + "epoch": 0.685422469823584, + "grad_norm": 1.5947608947753906, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.849368691444397, + "num_tokens": 134580791.0, + "step": 3691 + }, + { + "epoch": 0.6856081708449396, + "grad_norm": 1.5813652276992798, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8637186288833618, + "num_tokens": 134616195.0, + "step": 3692 + }, + { + "epoch": 0.6857938718662953, + "grad_norm": 1.5872983932495117, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8657174110412598, + "num_tokens": 134649754.0, + "step": 3693 + }, + { + "epoch": 0.6859795728876509, + "grad_norm": 1.641448736190796, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8664953112602234, + "num_tokens": 134684750.0, + "step": 3694 + }, + { + "epoch": 0.6861652739090065, + "grad_norm": 1.4940695762634277, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8656379580497742, + "num_tokens": 134721513.0, + "step": 3695 + }, + { + "epoch": 0.6863509749303621, + "grad_norm": 1.4522440433502197, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8717488646507263, + "num_tokens": 134760886.0, + "step": 3696 + }, + { + "epoch": 0.6865366759517177, + "grad_norm": 1.5570615530014038, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8601444959640503, + "num_tokens": 134795660.0, + "step": 3697 + }, + { + "epoch": 0.6867223769730734, + "grad_norm": 1.4735205173492432, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8762982487678528, + "num_tokens": 134832695.0, + "step": 3698 + }, + { + "epoch": 0.686908077994429, + "grad_norm": 1.581355094909668, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8652397394180298, + "num_tokens": 134868489.0, + "step": 3699 + }, + { + "epoch": 0.6870937790157846, + "grad_norm": 1.5682833194732666, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8719614744186401, + "num_tokens": 134908059.0, + "step": 3700 + }, + { + "epoch": 0.6872794800371402, + "grad_norm": 1.7791646718978882, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8403758406639099, + "num_tokens": 134938349.0, + "step": 3701 + }, + { + "epoch": 0.6874651810584959, + "grad_norm": 1.7544599771499634, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8621591329574585, + "num_tokens": 134971551.0, + "step": 3702 + }, + { + "epoch": 0.6876508820798515, + "grad_norm": 1.6091063022613525, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8784371018409729, + "num_tokens": 135005806.0, + "step": 3703 + }, + { + "epoch": 0.6878365831012071, + "grad_norm": 1.5593249797821045, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8713880181312561, + "num_tokens": 135043624.0, + "step": 3704 + }, + { + "epoch": 0.6880222841225627, + "grad_norm": 1.581626057624817, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8536480665206909, + "num_tokens": 135081090.0, + "step": 3705 + }, + { + "epoch": 0.6882079851439183, + "grad_norm": 1.385196328163147, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8687897324562073, + "num_tokens": 135122536.0, + "step": 3706 + }, + { + "epoch": 0.6883936861652739, + "grad_norm": 1.7129356861114502, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8499996662139893, + "num_tokens": 135154613.0, + "step": 3707 + }, + { + "epoch": 0.6885793871866295, + "grad_norm": 1.637559413909912, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8512542843818665, + "num_tokens": 135192812.0, + "step": 3708 + }, + { + "epoch": 0.6887650882079851, + "grad_norm": 1.8058393001556396, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8659876585006714, + "num_tokens": 135226155.0, + "step": 3709 + }, + { + "epoch": 0.6889507892293407, + "grad_norm": 1.560416340827942, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8661354780197144, + "num_tokens": 135259818.0, + "step": 3710 + }, + { + "epoch": 0.6891364902506963, + "grad_norm": 1.5034441947937012, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8686118721961975, + "num_tokens": 135295681.0, + "step": 3711 + }, + { + "epoch": 0.689322191272052, + "grad_norm": 1.4447696208953857, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8692180514335632, + "num_tokens": 135337273.0, + "step": 3712 + }, + { + "epoch": 0.6895078922934076, + "grad_norm": 1.5224558115005493, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8691052794456482, + "num_tokens": 135370383.0, + "step": 3713 + }, + { + "epoch": 0.6896935933147632, + "grad_norm": 1.704383134841919, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8575767278671265, + "num_tokens": 135400835.0, + "step": 3714 + }, + { + "epoch": 0.6898792943361188, + "grad_norm": 1.5803923606872559, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8718007802963257, + "num_tokens": 135437320.0, + "step": 3715 + }, + { + "epoch": 0.6900649953574745, + "grad_norm": 1.3697714805603027, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8671305179595947, + "num_tokens": 135479731.0, + "step": 3716 + }, + { + "epoch": 0.6902506963788301, + "grad_norm": 1.4711886644363403, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8821636438369751, + "num_tokens": 135517258.0, + "step": 3717 + }, + { + "epoch": 0.6904363974001857, + "grad_norm": 1.5386337041854858, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.858278214931488, + "num_tokens": 135556272.0, + "step": 3718 + }, + { + "epoch": 0.6906220984215413, + "grad_norm": 1.4994696378707886, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8583470582962036, + "num_tokens": 135593483.0, + "step": 3719 + }, + { + "epoch": 0.6908077994428969, + "grad_norm": 1.6197445392608643, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8774709105491638, + "num_tokens": 135625475.0, + "step": 3720 + }, + { + "epoch": 0.6909935004642526, + "grad_norm": 1.5348691940307617, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8722072839736938, + "num_tokens": 135664807.0, + "step": 3721 + }, + { + "epoch": 0.6911792014856082, + "grad_norm": 1.6657414436340332, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8676671981811523, + "num_tokens": 135696473.0, + "step": 3722 + }, + { + "epoch": 0.6913649025069638, + "grad_norm": 1.490646481513977, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8643239140510559, + "num_tokens": 135735057.0, + "step": 3723 + }, + { + "epoch": 0.6915506035283194, + "grad_norm": 1.5112731456756592, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8445079326629639, + "num_tokens": 135774656.0, + "step": 3724 + }, + { + "epoch": 0.691736304549675, + "grad_norm": 1.629980444908142, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8695986270904541, + "num_tokens": 135810093.0, + "step": 3725 + }, + { + "epoch": 0.6919220055710307, + "grad_norm": 1.6165162324905396, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8558661341667175, + "num_tokens": 135847476.0, + "step": 3726 + }, + { + "epoch": 0.6921077065923863, + "grad_norm": 1.5194016695022583, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8625558018684387, + "num_tokens": 135881666.0, + "step": 3727 + }, + { + "epoch": 0.6922934076137419, + "grad_norm": 1.5403048992156982, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8434152603149414, + "num_tokens": 135919909.0, + "step": 3728 + }, + { + "epoch": 0.6924791086350975, + "grad_norm": 1.5248148441314697, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8627794981002808, + "num_tokens": 135957858.0, + "step": 3729 + }, + { + "epoch": 0.6926648096564532, + "grad_norm": 1.508332371711731, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8704265356063843, + "num_tokens": 135997031.0, + "step": 3730 + }, + { + "epoch": 0.6928505106778087, + "grad_norm": 1.4852920770645142, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8679066896438599, + "num_tokens": 136036008.0, + "step": 3731 + }, + { + "epoch": 0.6930362116991643, + "grad_norm": 1.5866488218307495, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8609254360198975, + "num_tokens": 136070278.0, + "step": 3732 + }, + { + "epoch": 0.6932219127205199, + "grad_norm": 1.5742267370224, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8610653281211853, + "num_tokens": 136110184.0, + "step": 3733 + }, + { + "epoch": 0.6934076137418755, + "grad_norm": 1.5853972434997559, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.866302490234375, + "num_tokens": 136142449.0, + "step": 3734 + }, + { + "epoch": 0.6935933147632312, + "grad_norm": 1.5900685787200928, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8745933771133423, + "num_tokens": 136175328.0, + "step": 3735 + }, + { + "epoch": 0.6937790157845868, + "grad_norm": 1.492797613143921, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8549733757972717, + "num_tokens": 136214333.0, + "step": 3736 + }, + { + "epoch": 0.6939647168059424, + "grad_norm": 1.6478289365768433, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8679728507995605, + "num_tokens": 136247373.0, + "step": 3737 + }, + { + "epoch": 0.694150417827298, + "grad_norm": 1.4837909936904907, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8638285398483276, + "num_tokens": 136285741.0, + "step": 3738 + }, + { + "epoch": 0.6943361188486536, + "grad_norm": 1.5197492837905884, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8724738359451294, + "num_tokens": 136328586.0, + "step": 3739 + }, + { + "epoch": 0.6945218198700093, + "grad_norm": 1.5838537216186523, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8690578937530518, + "num_tokens": 136364607.0, + "step": 3740 + }, + { + "epoch": 0.6947075208913649, + "grad_norm": 1.5487492084503174, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8629008531570435, + "num_tokens": 136400513.0, + "step": 3741 + }, + { + "epoch": 0.6948932219127205, + "grad_norm": 1.471898078918457, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8675737977027893, + "num_tokens": 136439165.0, + "step": 3742 + }, + { + "epoch": 0.6950789229340761, + "grad_norm": 1.5922768115997314, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8581616878509521, + "num_tokens": 136476005.0, + "step": 3743 + }, + { + "epoch": 0.6952646239554318, + "grad_norm": 1.5442161560058594, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.855576753616333, + "num_tokens": 136513386.0, + "step": 3744 + }, + { + "epoch": 0.6954503249767874, + "grad_norm": 1.4817670583724976, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8718637228012085, + "num_tokens": 136550607.0, + "step": 3745 + }, + { + "epoch": 0.695636025998143, + "grad_norm": 1.5651341676712036, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8486098051071167, + "num_tokens": 136588212.0, + "step": 3746 + }, + { + "epoch": 0.6958217270194986, + "grad_norm": 1.6532971858978271, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8683807849884033, + "num_tokens": 136621456.0, + "step": 3747 + }, + { + "epoch": 0.6960074280408542, + "grad_norm": 1.591729998588562, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8598529100418091, + "num_tokens": 136657136.0, + "step": 3748 + }, + { + "epoch": 0.6961931290622099, + "grad_norm": 1.6775761842727661, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8698428869247437, + "num_tokens": 136689570.0, + "step": 3749 + }, + { + "epoch": 0.6963788300835655, + "grad_norm": 1.4484515190124512, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.86986243724823, + "num_tokens": 136726304.0, + "step": 3750 + }, + { + "epoch": 0.6965645311049211, + "grad_norm": 1.5433768033981323, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8761190176010132, + "num_tokens": 136761767.0, + "step": 3751 + }, + { + "epoch": 0.6967502321262767, + "grad_norm": 1.4422636032104492, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8596220016479492, + "num_tokens": 136804674.0, + "step": 3752 + }, + { + "epoch": 0.6969359331476324, + "grad_norm": 1.5303276777267456, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8738002777099609, + "num_tokens": 136840887.0, + "step": 3753 + }, + { + "epoch": 0.697121634168988, + "grad_norm": 1.5016981363296509, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8699291348457336, + "num_tokens": 136876254.0, + "step": 3754 + }, + { + "epoch": 0.6973073351903436, + "grad_norm": 1.4877315759658813, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8517204523086548, + "num_tokens": 136918246.0, + "step": 3755 + }, + { + "epoch": 0.6974930362116991, + "grad_norm": 1.670133113861084, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8476866483688354, + "num_tokens": 136954549.0, + "step": 3756 + }, + { + "epoch": 0.6976787372330547, + "grad_norm": 1.5775244235992432, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.870681643486023, + "num_tokens": 136991906.0, + "step": 3757 + }, + { + "epoch": 0.6978644382544104, + "grad_norm": 1.5937130451202393, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8678668141365051, + "num_tokens": 137023496.0, + "step": 3758 + }, + { + "epoch": 0.698050139275766, + "grad_norm": 1.5917205810546875, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8692785501480103, + "num_tokens": 137054419.0, + "step": 3759 + }, + { + "epoch": 0.6982358402971216, + "grad_norm": 1.6388967037200928, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8482936024665833, + "num_tokens": 137088671.0, + "step": 3760 + }, + { + "epoch": 0.6984215413184772, + "grad_norm": 1.710476040840149, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8651249408721924, + "num_tokens": 137118303.0, + "step": 3761 + }, + { + "epoch": 0.6986072423398328, + "grad_norm": 1.4603503942489624, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8715126514434814, + "num_tokens": 137156867.0, + "step": 3762 + }, + { + "epoch": 0.6987929433611885, + "grad_norm": 1.4089823961257935, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8614232540130615, + "num_tokens": 137199303.0, + "step": 3763 + }, + { + "epoch": 0.6989786443825441, + "grad_norm": 1.549002766609192, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8584021329879761, + "num_tokens": 137234455.0, + "step": 3764 + }, + { + "epoch": 0.6991643454038997, + "grad_norm": 1.5221401453018188, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8585492372512817, + "num_tokens": 137275854.0, + "step": 3765 + }, + { + "epoch": 0.6993500464252553, + "grad_norm": 1.6069114208221436, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8666298389434814, + "num_tokens": 137309995.0, + "step": 3766 + }, + { + "epoch": 0.699535747446611, + "grad_norm": 1.8011730909347534, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8554432988166809, + "num_tokens": 137339976.0, + "step": 3767 + }, + { + "epoch": 0.6997214484679666, + "grad_norm": 1.5480252504348755, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8699831962585449, + "num_tokens": 137376774.0, + "step": 3768 + }, + { + "epoch": 0.6999071494893222, + "grad_norm": 1.474373459815979, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8632151484489441, + "num_tokens": 137414838.0, + "step": 3769 + }, + { + "epoch": 0.7000928505106778, + "grad_norm": 1.5919246673583984, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8764916658401489, + "num_tokens": 137448072.0, + "step": 3770 + }, + { + "epoch": 0.7002785515320334, + "grad_norm": 1.6605604887008667, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8579769730567932, + "num_tokens": 137485967.0, + "step": 3771 + }, + { + "epoch": 0.7004642525533891, + "grad_norm": 1.6545665264129639, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.852135181427002, + "num_tokens": 137522244.0, + "step": 3772 + }, + { + "epoch": 0.7006499535747447, + "grad_norm": 1.4576750993728638, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8589125275611877, + "num_tokens": 137564175.0, + "step": 3773 + }, + { + "epoch": 0.7008356545961003, + "grad_norm": 1.4454749822616577, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8661189675331116, + "num_tokens": 137606774.0, + "step": 3774 + }, + { + "epoch": 0.7010213556174559, + "grad_norm": 1.5061109066009521, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8695720434188843, + "num_tokens": 137644143.0, + "step": 3775 + }, + { + "epoch": 0.7012070566388116, + "grad_norm": 1.5788064002990723, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8566181659698486, + "num_tokens": 137679994.0, + "step": 3776 + }, + { + "epoch": 0.7013927576601672, + "grad_norm": 1.640184760093689, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8549176454544067, + "num_tokens": 137715646.0, + "step": 3777 + }, + { + "epoch": 0.7015784586815228, + "grad_norm": 1.56399667263031, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8704931735992432, + "num_tokens": 137750675.0, + "step": 3778 + }, + { + "epoch": 0.7017641597028784, + "grad_norm": 1.8033242225646973, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8557870984077454, + "num_tokens": 137782630.0, + "step": 3779 + }, + { + "epoch": 0.7019498607242339, + "grad_norm": 1.7026430368423462, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8377255797386169, + "num_tokens": 137815940.0, + "step": 3780 + }, + { + "epoch": 0.7021355617455896, + "grad_norm": 1.4952458143234253, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8700599670410156, + "num_tokens": 137851732.0, + "step": 3781 + }, + { + "epoch": 0.7023212627669452, + "grad_norm": 1.5200161933898926, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.865229606628418, + "num_tokens": 137886823.0, + "step": 3782 + }, + { + "epoch": 0.7025069637883008, + "grad_norm": 1.6041802167892456, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8603425025939941, + "num_tokens": 137924694.0, + "step": 3783 + }, + { + "epoch": 0.7026926648096564, + "grad_norm": 1.8383021354675293, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.857094943523407, + "num_tokens": 137951830.0, + "step": 3784 + }, + { + "epoch": 0.702878365831012, + "grad_norm": 1.5300939083099365, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8655508756637573, + "num_tokens": 137988580.0, + "step": 3785 + }, + { + "epoch": 0.7030640668523677, + "grad_norm": 1.554487705230713, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8689626455307007, + "num_tokens": 138023829.0, + "step": 3786 + }, + { + "epoch": 0.7032497678737233, + "grad_norm": 1.639054775238037, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8517553210258484, + "num_tokens": 138057513.0, + "step": 3787 + }, + { + "epoch": 0.7034354688950789, + "grad_norm": 1.4954816102981567, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8753940463066101, + "num_tokens": 138094059.0, + "step": 3788 + }, + { + "epoch": 0.7036211699164345, + "grad_norm": 1.584689736366272, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8626399040222168, + "num_tokens": 138130631.0, + "step": 3789 + }, + { + "epoch": 0.7038068709377902, + "grad_norm": 1.6959164142608643, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8640658855438232, + "num_tokens": 138159446.0, + "step": 3790 + }, + { + "epoch": 0.7039925719591458, + "grad_norm": 1.5621095895767212, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8501745462417603, + "num_tokens": 138199984.0, + "step": 3791 + }, + { + "epoch": 0.7041782729805014, + "grad_norm": 1.4634946584701538, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8726593852043152, + "num_tokens": 138236221.0, + "step": 3792 + }, + { + "epoch": 0.704363974001857, + "grad_norm": 1.5025441646575928, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8660494089126587, + "num_tokens": 138278773.0, + "step": 3793 + }, + { + "epoch": 0.7045496750232126, + "grad_norm": 1.5884653329849243, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8841571807861328, + "num_tokens": 138315461.0, + "step": 3794 + }, + { + "epoch": 0.7047353760445683, + "grad_norm": 1.492576003074646, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.875211238861084, + "num_tokens": 138351781.0, + "step": 3795 + }, + { + "epoch": 0.7049210770659239, + "grad_norm": 1.481785774230957, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8731926083564758, + "num_tokens": 138390574.0, + "step": 3796 + }, + { + "epoch": 0.7051067780872795, + "grad_norm": 1.493658185005188, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8740804195404053, + "num_tokens": 138430615.0, + "step": 3797 + }, + { + "epoch": 0.7052924791086351, + "grad_norm": 1.5789300203323364, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8674790859222412, + "num_tokens": 138464837.0, + "step": 3798 + }, + { + "epoch": 0.7054781801299908, + "grad_norm": 1.6789445877075195, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.872371256351471, + "num_tokens": 138494406.0, + "step": 3799 + }, + { + "epoch": 0.7056638811513464, + "grad_norm": 1.9799299240112305, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8663501739501953, + "num_tokens": 138525046.0, + "step": 3800 + }, + { + "epoch": 0.705849582172702, + "grad_norm": 1.5247108936309814, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8666881322860718, + "num_tokens": 138557672.0, + "step": 3801 + }, + { + "epoch": 0.7060352831940576, + "grad_norm": 1.6615102291107178, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8548798561096191, + "num_tokens": 138591311.0, + "step": 3802 + }, + { + "epoch": 0.7062209842154132, + "grad_norm": 1.4619920253753662, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8701863288879395, + "num_tokens": 138627753.0, + "step": 3803 + }, + { + "epoch": 0.7064066852367687, + "grad_norm": 1.5999547243118286, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8537325859069824, + "num_tokens": 138662578.0, + "step": 3804 + }, + { + "epoch": 0.7065923862581244, + "grad_norm": 1.5412362813949585, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8654470443725586, + "num_tokens": 138698145.0, + "step": 3805 + }, + { + "epoch": 0.70677808727948, + "grad_norm": 1.5145024061203003, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8636240363121033, + "num_tokens": 138735252.0, + "step": 3806 + }, + { + "epoch": 0.7069637883008356, + "grad_norm": 1.5328532457351685, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8655584454536438, + "num_tokens": 138771498.0, + "step": 3807 + }, + { + "epoch": 0.7071494893221912, + "grad_norm": 1.363314151763916, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8638869524002075, + "num_tokens": 138815962.0, + "step": 3808 + }, + { + "epoch": 0.7073351903435469, + "grad_norm": 1.4146391153335571, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8766030073165894, + "num_tokens": 138857291.0, + "step": 3809 + }, + { + "epoch": 0.7075208913649025, + "grad_norm": 1.4878791570663452, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8479578495025635, + "num_tokens": 138900103.0, + "step": 3810 + }, + { + "epoch": 0.7077065923862581, + "grad_norm": 1.5687251091003418, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.86715167760849, + "num_tokens": 138934704.0, + "step": 3811 + }, + { + "epoch": 0.7078922934076137, + "grad_norm": 1.418962001800537, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8629482388496399, + "num_tokens": 138978021.0, + "step": 3812 + }, + { + "epoch": 0.7080779944289693, + "grad_norm": 1.5672885179519653, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8618755340576172, + "num_tokens": 139015506.0, + "step": 3813 + }, + { + "epoch": 0.708263695450325, + "grad_norm": 1.5130023956298828, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8708889484405518, + "num_tokens": 139053278.0, + "step": 3814 + }, + { + "epoch": 0.7084493964716806, + "grad_norm": 1.4014126062393188, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.869297981262207, + "num_tokens": 139092113.0, + "step": 3815 + }, + { + "epoch": 0.7086350974930362, + "grad_norm": 1.460288166999817, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8726964592933655, + "num_tokens": 139129328.0, + "step": 3816 + }, + { + "epoch": 0.7088207985143918, + "grad_norm": 1.6152362823486328, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8623271584510803, + "num_tokens": 139162418.0, + "step": 3817 + }, + { + "epoch": 0.7090064995357475, + "grad_norm": 1.5750994682312012, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8748922348022461, + "num_tokens": 139194870.0, + "step": 3818 + }, + { + "epoch": 0.7091922005571031, + "grad_norm": 1.6370625495910645, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8584373593330383, + "num_tokens": 139230738.0, + "step": 3819 + }, + { + "epoch": 0.7093779015784587, + "grad_norm": 1.500206470489502, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8576788902282715, + "num_tokens": 139270958.0, + "step": 3820 + }, + { + "epoch": 0.7095636025998143, + "grad_norm": 1.5034782886505127, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8567012548446655, + "num_tokens": 139307787.0, + "step": 3821 + }, + { + "epoch": 0.70974930362117, + "grad_norm": 1.623816728591919, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8501105308532715, + "num_tokens": 139342933.0, + "step": 3822 + }, + { + "epoch": 0.7099350046425256, + "grad_norm": 1.445178747177124, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8806142210960388, + "num_tokens": 139381191.0, + "step": 3823 + }, + { + "epoch": 0.7101207056638812, + "grad_norm": 1.4842257499694824, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8782061338424683, + "num_tokens": 139420980.0, + "step": 3824 + }, + { + "epoch": 0.7103064066852368, + "grad_norm": 1.4010732173919678, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8814541101455688, + "num_tokens": 139462263.0, + "step": 3825 + }, + { + "epoch": 0.7104921077065924, + "grad_norm": 1.5415457487106323, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8714558482170105, + "num_tokens": 139497229.0, + "step": 3826 + }, + { + "epoch": 0.7106778087279481, + "grad_norm": 1.5876797437667847, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8728470802307129, + "num_tokens": 139530142.0, + "step": 3827 + }, + { + "epoch": 0.7108635097493036, + "grad_norm": 1.52223801612854, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8733214139938354, + "num_tokens": 139563465.0, + "step": 3828 + }, + { + "epoch": 0.7110492107706592, + "grad_norm": 1.4957914352416992, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8671905398368835, + "num_tokens": 139602938.0, + "step": 3829 + }, + { + "epoch": 0.7112349117920148, + "grad_norm": 1.5178099870681763, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8701310157775879, + "num_tokens": 139637074.0, + "step": 3830 + }, + { + "epoch": 0.7114206128133704, + "grad_norm": 1.4915237426757812, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8614524602890015, + "num_tokens": 139673096.0, + "step": 3831 + }, + { + "epoch": 0.7116063138347261, + "grad_norm": 1.4657953977584839, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.875199019908905, + "num_tokens": 139709286.0, + "step": 3832 + }, + { + "epoch": 0.7117920148560817, + "grad_norm": 1.445542335510254, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.875920295715332, + "num_tokens": 139746351.0, + "step": 3833 + }, + { + "epoch": 0.7119777158774373, + "grad_norm": 1.6723899841308594, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8696696162223816, + "num_tokens": 139778273.0, + "step": 3834 + }, + { + "epoch": 0.7121634168987929, + "grad_norm": 1.507331132888794, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8738466501235962, + "num_tokens": 139814344.0, + "step": 3835 + }, + { + "epoch": 0.7123491179201485, + "grad_norm": 1.4575563669204712, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8687645196914673, + "num_tokens": 139852238.0, + "step": 3836 + }, + { + "epoch": 0.7125348189415042, + "grad_norm": 1.604600191116333, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8589248657226562, + "num_tokens": 139884096.0, + "step": 3837 + }, + { + "epoch": 0.7127205199628598, + "grad_norm": 1.4895496368408203, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.870964527130127, + "num_tokens": 139919103.0, + "step": 3838 + }, + { + "epoch": 0.7129062209842154, + "grad_norm": 1.566516399383545, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8681177496910095, + "num_tokens": 139954101.0, + "step": 3839 + }, + { + "epoch": 0.713091922005571, + "grad_norm": 1.4828697443008423, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8560254573822021, + "num_tokens": 139997467.0, + "step": 3840 + }, + { + "epoch": 0.7132776230269267, + "grad_norm": 1.4032459259033203, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8704954385757446, + "num_tokens": 140039236.0, + "step": 3841 + }, + { + "epoch": 0.7134633240482823, + "grad_norm": 1.5376542806625366, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8693943619728088, + "num_tokens": 140074105.0, + "step": 3842 + }, + { + "epoch": 0.7136490250696379, + "grad_norm": 1.7433110475540161, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8593789339065552, + "num_tokens": 140102825.0, + "step": 3843 + }, + { + "epoch": 0.7138347260909935, + "grad_norm": 1.476309895515442, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8732094764709473, + "num_tokens": 140138845.0, + "step": 3844 + }, + { + "epoch": 0.7140204271123491, + "grad_norm": 1.8202701807022095, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8498122692108154, + "num_tokens": 140170405.0, + "step": 3845 + }, + { + "epoch": 0.7142061281337048, + "grad_norm": 1.5513782501220703, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8589174151420593, + "num_tokens": 140206103.0, + "step": 3846 + }, + { + "epoch": 0.7143918291550604, + "grad_norm": 1.4984500408172607, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8594399690628052, + "num_tokens": 140244489.0, + "step": 3847 + }, + { + "epoch": 0.714577530176416, + "grad_norm": 1.5440144538879395, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8658799529075623, + "num_tokens": 140279167.0, + "step": 3848 + }, + { + "epoch": 0.7147632311977716, + "grad_norm": 1.5269170999526978, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8477743864059448, + "num_tokens": 140317598.0, + "step": 3849 + }, + { + "epoch": 0.7149489322191273, + "grad_norm": 1.495519757270813, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8661426305770874, + "num_tokens": 140355062.0, + "step": 3850 + }, + { + "epoch": 0.7151346332404829, + "grad_norm": 1.518225073814392, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8732935190200806, + "num_tokens": 140387844.0, + "step": 3851 + }, + { + "epoch": 0.7153203342618384, + "grad_norm": 1.460516095161438, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8626666069030762, + "num_tokens": 140428786.0, + "step": 3852 + }, + { + "epoch": 0.715506035283194, + "grad_norm": 1.548315167427063, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8666793704032898, + "num_tokens": 140462857.0, + "step": 3853 + }, + { + "epoch": 0.7156917363045496, + "grad_norm": 1.6800365447998047, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8776116371154785, + "num_tokens": 140499279.0, + "step": 3854 + }, + { + "epoch": 0.7158774373259053, + "grad_norm": 1.5226085186004639, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8668474555015564, + "num_tokens": 140534862.0, + "step": 3855 + }, + { + "epoch": 0.7160631383472609, + "grad_norm": 1.5156209468841553, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8645057082176208, + "num_tokens": 140569604.0, + "step": 3856 + }, + { + "epoch": 0.7162488393686165, + "grad_norm": 1.5334432125091553, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8624799251556396, + "num_tokens": 140605224.0, + "step": 3857 + }, + { + "epoch": 0.7164345403899721, + "grad_norm": 1.4550104141235352, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8651145696640015, + "num_tokens": 140642779.0, + "step": 3858 + }, + { + "epoch": 0.7166202414113277, + "grad_norm": 1.4465818405151367, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8749228715896606, + "num_tokens": 140680826.0, + "step": 3859 + }, + { + "epoch": 0.7168059424326834, + "grad_norm": 1.4454933404922485, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8800408840179443, + "num_tokens": 140720621.0, + "step": 3860 + }, + { + "epoch": 0.716991643454039, + "grad_norm": 1.4279342889785767, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8769389390945435, + "num_tokens": 140759815.0, + "step": 3861 + }, + { + "epoch": 0.7171773444753946, + "grad_norm": 1.445630669593811, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8702276349067688, + "num_tokens": 140797539.0, + "step": 3862 + }, + { + "epoch": 0.7173630454967502, + "grad_norm": 1.4602198600769043, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8536031246185303, + "num_tokens": 140836332.0, + "step": 3863 + }, + { + "epoch": 0.7175487465181059, + "grad_norm": 1.5246152877807617, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8804448843002319, + "num_tokens": 140869749.0, + "step": 3864 + }, + { + "epoch": 0.7177344475394615, + "grad_norm": 1.626213550567627, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8603076934814453, + "num_tokens": 140904616.0, + "step": 3865 + }, + { + "epoch": 0.7179201485608171, + "grad_norm": 1.5590800046920776, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8658637404441833, + "num_tokens": 140944158.0, + "step": 3866 + }, + { + "epoch": 0.7181058495821727, + "grad_norm": 1.450695514678955, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8726152181625366, + "num_tokens": 140981621.0, + "step": 3867 + }, + { + "epoch": 0.7182915506035283, + "grad_norm": 1.509583830833435, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8779815435409546, + "num_tokens": 141016474.0, + "step": 3868 + }, + { + "epoch": 0.718477251624884, + "grad_norm": 1.536167860031128, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8723821640014648, + "num_tokens": 141052716.0, + "step": 3869 + }, + { + "epoch": 0.7186629526462396, + "grad_norm": 1.6027988195419312, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8886198997497559, + "num_tokens": 141082722.0, + "step": 3870 + }, + { + "epoch": 0.7188486536675952, + "grad_norm": 1.426867127418518, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8731681704521179, + "num_tokens": 141122790.0, + "step": 3871 + }, + { + "epoch": 0.7190343546889508, + "grad_norm": 1.602124810218811, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8839426040649414, + "num_tokens": 141157562.0, + "step": 3872 + }, + { + "epoch": 0.7192200557103064, + "grad_norm": 1.571956753730774, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8610990047454834, + "num_tokens": 141191914.0, + "step": 3873 + }, + { + "epoch": 0.7194057567316621, + "grad_norm": 1.6745353937149048, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8462982773780823, + "num_tokens": 141225395.0, + "step": 3874 + }, + { + "epoch": 0.7195914577530177, + "grad_norm": 1.4834017753601074, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8626895546913147, + "num_tokens": 141266500.0, + "step": 3875 + }, + { + "epoch": 0.7197771587743732, + "grad_norm": 1.483720064163208, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8694428205490112, + "num_tokens": 141303014.0, + "step": 3876 + }, + { + "epoch": 0.7199628597957288, + "grad_norm": 1.6031512022018433, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8643757104873657, + "num_tokens": 141339320.0, + "step": 3877 + }, + { + "epoch": 0.7201485608170844, + "grad_norm": 1.521888256072998, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.860924243927002, + "num_tokens": 141378254.0, + "step": 3878 + }, + { + "epoch": 0.7203342618384401, + "grad_norm": 1.574257731437683, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8637711405754089, + "num_tokens": 141412059.0, + "step": 3879 + }, + { + "epoch": 0.7205199628597957, + "grad_norm": 1.6162091493606567, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8505430817604065, + "num_tokens": 141449444.0, + "step": 3880 + }, + { + "epoch": 0.7207056638811513, + "grad_norm": 1.666640043258667, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.877220869064331, + "num_tokens": 141478357.0, + "step": 3881 + }, + { + "epoch": 0.7208913649025069, + "grad_norm": 1.4831842184066772, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.873915433883667, + "num_tokens": 141512909.0, + "step": 3882 + }, + { + "epoch": 0.7210770659238626, + "grad_norm": 1.5504006147384644, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8688308000564575, + "num_tokens": 141547996.0, + "step": 3883 + }, + { + "epoch": 0.7212627669452182, + "grad_norm": 1.4906938076019287, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8850418329238892, + "num_tokens": 141583589.0, + "step": 3884 + }, + { + "epoch": 0.7214484679665738, + "grad_norm": 1.5415467023849487, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8582833409309387, + "num_tokens": 141626497.0, + "step": 3885 + }, + { + "epoch": 0.7216341689879294, + "grad_norm": 1.5618610382080078, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8744474649429321, + "num_tokens": 141662613.0, + "step": 3886 + }, + { + "epoch": 0.721819870009285, + "grad_norm": 1.4892135858535767, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8726040124893188, + "num_tokens": 141698110.0, + "step": 3887 + }, + { + "epoch": 0.7220055710306407, + "grad_norm": 1.3984712362289429, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8716264963150024, + "num_tokens": 141738675.0, + "step": 3888 + }, + { + "epoch": 0.7221912720519963, + "grad_norm": 1.53062105178833, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.858620285987854, + "num_tokens": 141775434.0, + "step": 3889 + }, + { + "epoch": 0.7223769730733519, + "grad_norm": 1.5209026336669922, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8733060956001282, + "num_tokens": 141815010.0, + "step": 3890 + }, + { + "epoch": 0.7225626740947075, + "grad_norm": 1.7417658567428589, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8438600301742554, + "num_tokens": 141850997.0, + "step": 3891 + }, + { + "epoch": 0.7227483751160632, + "grad_norm": 1.7391200065612793, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8571512699127197, + "num_tokens": 141881667.0, + "step": 3892 + }, + { + "epoch": 0.7229340761374188, + "grad_norm": 1.536417007446289, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8615074157714844, + "num_tokens": 141916914.0, + "step": 3893 + }, + { + "epoch": 0.7231197771587744, + "grad_norm": 1.4337925910949707, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8625760674476624, + "num_tokens": 141959357.0, + "step": 3894 + }, + { + "epoch": 0.72330547818013, + "grad_norm": 1.4772312641143799, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8634487390518188, + "num_tokens": 142001117.0, + "step": 3895 + }, + { + "epoch": 0.7234911792014856, + "grad_norm": 1.6968368291854858, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8605248928070068, + "num_tokens": 142038681.0, + "step": 3896 + }, + { + "epoch": 0.7236768802228413, + "grad_norm": 1.4629942178726196, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8555851578712463, + "num_tokens": 142079263.0, + "step": 3897 + }, + { + "epoch": 0.7238625812441969, + "grad_norm": 1.519332766532898, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8679920434951782, + "num_tokens": 142113971.0, + "step": 3898 + }, + { + "epoch": 0.7240482822655525, + "grad_norm": 1.339820146560669, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8752009272575378, + "num_tokens": 142156279.0, + "step": 3899 + }, + { + "epoch": 0.724233983286908, + "grad_norm": 1.3317937850952148, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8685323596000671, + "num_tokens": 142197815.0, + "step": 3900 + }, + { + "epoch": 0.7244196843082636, + "grad_norm": 1.5047487020492554, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8577257394790649, + "num_tokens": 142239891.0, + "step": 3901 + }, + { + "epoch": 0.7246053853296193, + "grad_norm": 1.6598948240280151, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8794882297515869, + "num_tokens": 142280478.0, + "step": 3902 + }, + { + "epoch": 0.7247910863509749, + "grad_norm": 1.652862787246704, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8636917471885681, + "num_tokens": 142314477.0, + "step": 3903 + }, + { + "epoch": 0.7249767873723305, + "grad_norm": 1.4919875860214233, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.875015139579773, + "num_tokens": 142351125.0, + "step": 3904 + }, + { + "epoch": 0.7251624883936861, + "grad_norm": 1.469140887260437, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8687018156051636, + "num_tokens": 142387655.0, + "step": 3905 + }, + { + "epoch": 0.7253481894150418, + "grad_norm": 1.604512333869934, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8666064143180847, + "num_tokens": 142424384.0, + "step": 3906 + }, + { + "epoch": 0.7255338904363974, + "grad_norm": 1.6607714891433716, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8582372665405273, + "num_tokens": 142460051.0, + "step": 3907 + }, + { + "epoch": 0.725719591457753, + "grad_norm": 1.581526279449463, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8665355443954468, + "num_tokens": 142493683.0, + "step": 3908 + }, + { + "epoch": 0.7259052924791086, + "grad_norm": 1.6148407459259033, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.868668258190155, + "num_tokens": 142529870.0, + "step": 3909 + }, + { + "epoch": 0.7260909935004642, + "grad_norm": 1.5998700857162476, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8638191223144531, + "num_tokens": 142568740.0, + "step": 3910 + }, + { + "epoch": 0.7262766945218199, + "grad_norm": 1.6841636896133423, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.856069803237915, + "num_tokens": 142603131.0, + "step": 3911 + }, + { + "epoch": 0.7264623955431755, + "grad_norm": 1.4800958633422852, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8641854524612427, + "num_tokens": 142641745.0, + "step": 3912 + }, + { + "epoch": 0.7266480965645311, + "grad_norm": 1.5954533815383911, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.85423743724823, + "num_tokens": 142678741.0, + "step": 3913 + }, + { + "epoch": 0.7268337975858867, + "grad_norm": 1.7813773155212402, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8693651556968689, + "num_tokens": 142708443.0, + "step": 3914 + }, + { + "epoch": 0.7270194986072424, + "grad_norm": 1.6137259006500244, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8595119714736938, + "num_tokens": 142741918.0, + "step": 3915 + }, + { + "epoch": 0.727205199628598, + "grad_norm": 1.6377185583114624, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8631867170333862, + "num_tokens": 142776680.0, + "step": 3916 + }, + { + "epoch": 0.7273909006499536, + "grad_norm": 1.5169070959091187, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8662930727005005, + "num_tokens": 142813147.0, + "step": 3917 + }, + { + "epoch": 0.7275766016713092, + "grad_norm": 1.513648271560669, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8582602739334106, + "num_tokens": 142850550.0, + "step": 3918 + }, + { + "epoch": 0.7277623026926648, + "grad_norm": 1.459726095199585, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8613626956939697, + "num_tokens": 142892369.0, + "step": 3919 + }, + { + "epoch": 0.7279480037140205, + "grad_norm": 1.4747397899627686, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8739082217216492, + "num_tokens": 142928827.0, + "step": 3920 + }, + { + "epoch": 0.7281337047353761, + "grad_norm": 1.3911120891571045, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8730388283729553, + "num_tokens": 142969221.0, + "step": 3921 + }, + { + "epoch": 0.7283194057567317, + "grad_norm": 1.479308843612671, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8611255288124084, + "num_tokens": 143008114.0, + "step": 3922 + }, + { + "epoch": 0.7285051067780873, + "grad_norm": 1.516387701034546, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8573475480079651, + "num_tokens": 143045548.0, + "step": 3923 + }, + { + "epoch": 0.728690807799443, + "grad_norm": 1.4470160007476807, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8798843622207642, + "num_tokens": 143078333.0, + "step": 3924 + }, + { + "epoch": 0.7288765088207985, + "grad_norm": 1.4166055917739868, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8781938552856445, + "num_tokens": 143116034.0, + "step": 3925 + }, + { + "epoch": 0.7290622098421541, + "grad_norm": 1.507065773010254, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8777732849121094, + "num_tokens": 143152536.0, + "step": 3926 + }, + { + "epoch": 0.7292479108635097, + "grad_norm": 1.566879153251648, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8486608266830444, + "num_tokens": 143188211.0, + "step": 3927 + }, + { + "epoch": 0.7294336118848653, + "grad_norm": 1.5831642150878906, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8547226190567017, + "num_tokens": 143221825.0, + "step": 3928 + }, + { + "epoch": 0.729619312906221, + "grad_norm": 1.5706344842910767, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.869999349117279, + "num_tokens": 143252170.0, + "step": 3929 + }, + { + "epoch": 0.7298050139275766, + "grad_norm": 1.5003294944763184, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8894397020339966, + "num_tokens": 143284470.0, + "step": 3930 + }, + { + "epoch": 0.7299907149489322, + "grad_norm": 1.6933969259262085, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8512572050094604, + "num_tokens": 143316701.0, + "step": 3931 + }, + { + "epoch": 0.7301764159702878, + "grad_norm": 1.5800565481185913, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.868380069732666, + "num_tokens": 143349901.0, + "step": 3932 + }, + { + "epoch": 0.7303621169916434, + "grad_norm": 1.5167629718780518, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.873439371585846, + "num_tokens": 143385972.0, + "step": 3933 + }, + { + "epoch": 0.7305478180129991, + "grad_norm": 1.3419309854507446, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8804281949996948, + "num_tokens": 143426696.0, + "step": 3934 + }, + { + "epoch": 0.7307335190343547, + "grad_norm": 1.5471175909042358, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8825456500053406, + "num_tokens": 143461559.0, + "step": 3935 + }, + { + "epoch": 0.7309192200557103, + "grad_norm": 1.731788158416748, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8573876023292542, + "num_tokens": 143493203.0, + "step": 3936 + }, + { + "epoch": 0.7311049210770659, + "grad_norm": 1.6033351421356201, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8567720055580139, + "num_tokens": 143531314.0, + "step": 3937 + }, + { + "epoch": 0.7312906220984215, + "grad_norm": 1.450366735458374, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8684078454971313, + "num_tokens": 143572317.0, + "step": 3938 + }, + { + "epoch": 0.7314763231197772, + "grad_norm": 1.489717721939087, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8790425062179565, + "num_tokens": 143608132.0, + "step": 3939 + }, + { + "epoch": 0.7316620241411328, + "grad_norm": 1.6645492315292358, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8538146018981934, + "num_tokens": 143640344.0, + "step": 3940 + }, + { + "epoch": 0.7318477251624884, + "grad_norm": 1.5573408603668213, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.851298451423645, + "num_tokens": 143678073.0, + "step": 3941 + }, + { + "epoch": 0.732033426183844, + "grad_norm": 1.6995583772659302, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.856076717376709, + "num_tokens": 143708664.0, + "step": 3942 + }, + { + "epoch": 0.7322191272051997, + "grad_norm": 1.4751815795898438, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8769199848175049, + "num_tokens": 143740790.0, + "step": 3943 + }, + { + "epoch": 0.7324048282265553, + "grad_norm": 1.7078641653060913, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8754175901412964, + "num_tokens": 143770650.0, + "step": 3944 + }, + { + "epoch": 0.7325905292479109, + "grad_norm": 1.560938835144043, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8709962964057922, + "num_tokens": 143803636.0, + "step": 3945 + }, + { + "epoch": 0.7327762302692665, + "grad_norm": 1.450739860534668, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8684267997741699, + "num_tokens": 143842519.0, + "step": 3946 + }, + { + "epoch": 0.7329619312906221, + "grad_norm": 1.5495015382766724, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8578778505325317, + "num_tokens": 143877288.0, + "step": 3947 + }, + { + "epoch": 0.7331476323119778, + "grad_norm": 1.5912435054779053, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8823490738868713, + "num_tokens": 143908857.0, + "step": 3948 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.4322632551193237, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8727131485939026, + "num_tokens": 143948286.0, + "step": 3949 + }, + { + "epoch": 0.7335190343546889, + "grad_norm": 1.5695503950119019, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8779916763305664, + "num_tokens": 143983446.0, + "step": 3950 + }, + { + "epoch": 0.7337047353760445, + "grad_norm": 1.4842760562896729, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8720196485519409, + "num_tokens": 144019804.0, + "step": 3951 + }, + { + "epoch": 0.7338904363974001, + "grad_norm": 1.4881441593170166, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.866340696811676, + "num_tokens": 144059905.0, + "step": 3952 + }, + { + "epoch": 0.7340761374187558, + "grad_norm": 1.5602153539657593, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8591355085372925, + "num_tokens": 144096646.0, + "step": 3953 + }, + { + "epoch": 0.7342618384401114, + "grad_norm": 1.499855637550354, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8622305989265442, + "num_tokens": 144137924.0, + "step": 3954 + }, + { + "epoch": 0.734447539461467, + "grad_norm": 1.550703525543213, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.849474310874939, + "num_tokens": 144176441.0, + "step": 3955 + }, + { + "epoch": 0.7346332404828226, + "grad_norm": 1.4798790216445923, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8747789263725281, + "num_tokens": 144213698.0, + "step": 3956 + }, + { + "epoch": 0.7348189415041783, + "grad_norm": 1.4333380460739136, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.865378737449646, + "num_tokens": 144257598.0, + "step": 3957 + }, + { + "epoch": 0.7350046425255339, + "grad_norm": 1.566186785697937, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8730685710906982, + "num_tokens": 144289466.0, + "step": 3958 + }, + { + "epoch": 0.7351903435468895, + "grad_norm": 1.4651471376419067, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8689320087432861, + "num_tokens": 144327674.0, + "step": 3959 + }, + { + "epoch": 0.7353760445682451, + "grad_norm": 1.5005648136138916, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8642991185188293, + "num_tokens": 144365608.0, + "step": 3960 + }, + { + "epoch": 0.7355617455896007, + "grad_norm": 1.563528299331665, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8654407858848572, + "num_tokens": 144402267.0, + "step": 3961 + }, + { + "epoch": 0.7357474466109564, + "grad_norm": 1.4743527173995972, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8618449568748474, + "num_tokens": 144440944.0, + "step": 3962 + }, + { + "epoch": 0.735933147632312, + "grad_norm": 1.5127286911010742, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8721828460693359, + "num_tokens": 144475407.0, + "step": 3963 + }, + { + "epoch": 0.7361188486536676, + "grad_norm": 1.3959715366363525, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8676755428314209, + "num_tokens": 144517749.0, + "step": 3964 + }, + { + "epoch": 0.7363045496750232, + "grad_norm": 1.4982666969299316, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.850555419921875, + "num_tokens": 144556688.0, + "step": 3965 + }, + { + "epoch": 0.7364902506963789, + "grad_norm": 1.3843326568603516, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8688341379165649, + "num_tokens": 144598855.0, + "step": 3966 + }, + { + "epoch": 0.7366759517177345, + "grad_norm": 1.4045339822769165, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8801388144493103, + "num_tokens": 144638805.0, + "step": 3967 + }, + { + "epoch": 0.7368616527390901, + "grad_norm": 1.4039649963378906, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.868928074836731, + "num_tokens": 144679278.0, + "step": 3968 + }, + { + "epoch": 0.7370473537604457, + "grad_norm": 1.4724316596984863, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8637624979019165, + "num_tokens": 144720677.0, + "step": 3969 + }, + { + "epoch": 0.7372330547818013, + "grad_norm": 1.4405255317687988, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8642984628677368, + "num_tokens": 144759942.0, + "step": 3970 + }, + { + "epoch": 0.737418755803157, + "grad_norm": 1.3692716360092163, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8851375579833984, + "num_tokens": 144800993.0, + "step": 3971 + }, + { + "epoch": 0.7376044568245126, + "grad_norm": 1.4873936176300049, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8698413372039795, + "num_tokens": 144836193.0, + "step": 3972 + }, + { + "epoch": 0.7377901578458681, + "grad_norm": 1.6534336805343628, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8741916418075562, + "num_tokens": 144867627.0, + "step": 3973 + }, + { + "epoch": 0.7379758588672237, + "grad_norm": 1.5884249210357666, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8765751123428345, + "num_tokens": 144904625.0, + "step": 3974 + }, + { + "epoch": 0.7381615598885793, + "grad_norm": 1.6057604551315308, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8486799001693726, + "num_tokens": 144937267.0, + "step": 3975 + }, + { + "epoch": 0.738347260909935, + "grad_norm": 1.4533252716064453, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8665610551834106, + "num_tokens": 144977285.0, + "step": 3976 + }, + { + "epoch": 0.7385329619312906, + "grad_norm": 1.5086371898651123, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8686800003051758, + "num_tokens": 145014400.0, + "step": 3977 + }, + { + "epoch": 0.7387186629526462, + "grad_norm": 1.5165399312973022, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8624714016914368, + "num_tokens": 145058308.0, + "step": 3978 + }, + { + "epoch": 0.7389043639740018, + "grad_norm": 1.4474400281906128, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8622883558273315, + "num_tokens": 145098005.0, + "step": 3979 + }, + { + "epoch": 0.7390900649953575, + "grad_norm": 1.5037977695465088, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8647143840789795, + "num_tokens": 145139329.0, + "step": 3980 + }, + { + "epoch": 0.7392757660167131, + "grad_norm": 1.4678890705108643, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8741083145141602, + "num_tokens": 145179127.0, + "step": 3981 + }, + { + "epoch": 0.7394614670380687, + "grad_norm": 1.646865725517273, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8678085803985596, + "num_tokens": 145214097.0, + "step": 3982 + }, + { + "epoch": 0.7396471680594243, + "grad_norm": 1.597733974456787, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8755437135696411, + "num_tokens": 145244523.0, + "step": 3983 + }, + { + "epoch": 0.7398328690807799, + "grad_norm": 1.4631739854812622, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8635151982307434, + "num_tokens": 145287578.0, + "step": 3984 + }, + { + "epoch": 0.7400185701021356, + "grad_norm": 1.5222556591033936, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8765422105789185, + "num_tokens": 145323774.0, + "step": 3985 + }, + { + "epoch": 0.7402042711234912, + "grad_norm": 1.6477878093719482, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8570573925971985, + "num_tokens": 145354738.0, + "step": 3986 + }, + { + "epoch": 0.7403899721448468, + "grad_norm": 1.5403153896331787, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8763990998268127, + "num_tokens": 145387736.0, + "step": 3987 + }, + { + "epoch": 0.7405756731662024, + "grad_norm": 1.5161902904510498, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8569232225418091, + "num_tokens": 145428431.0, + "step": 3988 + }, + { + "epoch": 0.740761374187558, + "grad_norm": 1.5692397356033325, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8716827630996704, + "num_tokens": 145464496.0, + "step": 3989 + }, + { + "epoch": 0.7409470752089137, + "grad_norm": 1.5174602270126343, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8610589504241943, + "num_tokens": 145502683.0, + "step": 3990 + }, + { + "epoch": 0.7411327762302693, + "grad_norm": 1.432454228401184, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8649416565895081, + "num_tokens": 145546383.0, + "step": 3991 + }, + { + "epoch": 0.7413184772516249, + "grad_norm": 1.523197054862976, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8837577104568481, + "num_tokens": 145580506.0, + "step": 3992 + }, + { + "epoch": 0.7415041782729805, + "grad_norm": 1.5227718353271484, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8520481586456299, + "num_tokens": 145619151.0, + "step": 3993 + }, + { + "epoch": 0.7416898792943362, + "grad_norm": 1.4931154251098633, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8654305934906006, + "num_tokens": 145660999.0, + "step": 3994 + }, + { + "epoch": 0.7418755803156918, + "grad_norm": 1.4411613941192627, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8634989261627197, + "num_tokens": 145700403.0, + "step": 3995 + }, + { + "epoch": 0.7420612813370474, + "grad_norm": 1.3677624464035034, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.869259238243103, + "num_tokens": 145743476.0, + "step": 3996 + }, + { + "epoch": 0.7422469823584029, + "grad_norm": 1.5337457656860352, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8711441159248352, + "num_tokens": 145778635.0, + "step": 3997 + }, + { + "epoch": 0.7424326833797585, + "grad_norm": 1.5526238679885864, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8679157495498657, + "num_tokens": 145813375.0, + "step": 3998 + }, + { + "epoch": 0.7426183844011142, + "grad_norm": 1.5141531229019165, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8683309555053711, + "num_tokens": 145849515.0, + "step": 3999 + }, + { + "epoch": 0.7428040854224698, + "grad_norm": 1.6373049020767212, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8699547052383423, + "num_tokens": 145883892.0, + "step": 4000 + }, + { + "epoch": 0.7429897864438254, + "grad_norm": 1.6363931894302368, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8385301828384399, + "num_tokens": 145920090.0, + "step": 4001 + }, + { + "epoch": 0.743175487465181, + "grad_norm": 1.5632097721099854, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.879981517791748, + "num_tokens": 145951338.0, + "step": 4002 + }, + { + "epoch": 0.7433611884865367, + "grad_norm": 1.5423970222473145, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8601534366607666, + "num_tokens": 145987090.0, + "step": 4003 + }, + { + "epoch": 0.7435468895078923, + "grad_norm": 1.5680456161499023, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8715322017669678, + "num_tokens": 146019164.0, + "step": 4004 + }, + { + "epoch": 0.7437325905292479, + "grad_norm": 1.6495189666748047, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8712817430496216, + "num_tokens": 146054908.0, + "step": 4005 + }, + { + "epoch": 0.7439182915506035, + "grad_norm": 1.5125415325164795, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.872234582901001, + "num_tokens": 146096962.0, + "step": 4006 + }, + { + "epoch": 0.7441039925719591, + "grad_norm": 1.4923008680343628, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8763198852539062, + "num_tokens": 146131154.0, + "step": 4007 + }, + { + "epoch": 0.7442896935933148, + "grad_norm": 1.4955180883407593, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8672522306442261, + "num_tokens": 146167249.0, + "step": 4008 + }, + { + "epoch": 0.7444753946146704, + "grad_norm": 1.5466500520706177, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8732988834381104, + "num_tokens": 146200596.0, + "step": 4009 + }, + { + "epoch": 0.744661095636026, + "grad_norm": 1.5273293256759644, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8818049430847168, + "num_tokens": 146231609.0, + "step": 4010 + }, + { + "epoch": 0.7448467966573816, + "grad_norm": 1.6382668018341064, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8676148653030396, + "num_tokens": 146263558.0, + "step": 4011 + }, + { + "epoch": 0.7450324976787372, + "grad_norm": 1.4189352989196777, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8841196298599243, + "num_tokens": 146301809.0, + "step": 4012 + }, + { + "epoch": 0.7452181987000929, + "grad_norm": 1.5880053043365479, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.866649866104126, + "num_tokens": 146335978.0, + "step": 4013 + }, + { + "epoch": 0.7454038997214485, + "grad_norm": 1.53419029712677, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8617404103279114, + "num_tokens": 146371213.0, + "step": 4014 + }, + { + "epoch": 0.7455896007428041, + "grad_norm": 1.513472557067871, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8672302961349487, + "num_tokens": 146404854.0, + "step": 4015 + }, + { + "epoch": 0.7457753017641597, + "grad_norm": 1.3973159790039062, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8740372657775879, + "num_tokens": 146444256.0, + "step": 4016 + }, + { + "epoch": 0.7459610027855154, + "grad_norm": 1.493262529373169, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.872623860836029, + "num_tokens": 146480731.0, + "step": 4017 + }, + { + "epoch": 0.746146703806871, + "grad_norm": 1.546578288078308, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8541803956031799, + "num_tokens": 146514973.0, + "step": 4018 + }, + { + "epoch": 0.7463324048282266, + "grad_norm": 1.515527606010437, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8608665466308594, + "num_tokens": 146555157.0, + "step": 4019 + }, + { + "epoch": 0.7465181058495822, + "grad_norm": 1.6076053380966187, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8621682524681091, + "num_tokens": 146588272.0, + "step": 4020 + }, + { + "epoch": 0.7467038068709377, + "grad_norm": 1.6491613388061523, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8578420281410217, + "num_tokens": 146624493.0, + "step": 4021 + }, + { + "epoch": 0.7468895078922934, + "grad_norm": 1.8614524602890015, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.857919454574585, + "num_tokens": 146656867.0, + "step": 4022 + }, + { + "epoch": 0.747075208913649, + "grad_norm": 1.5364738702774048, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8769850730895996, + "num_tokens": 146690777.0, + "step": 4023 + }, + { + "epoch": 0.7472609099350046, + "grad_norm": 1.4744949340820312, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8720914125442505, + "num_tokens": 146729340.0, + "step": 4024 + }, + { + "epoch": 0.7474466109563602, + "grad_norm": 1.689104437828064, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8494401574134827, + "num_tokens": 146761518.0, + "step": 4025 + }, + { + "epoch": 0.7476323119777158, + "grad_norm": 1.5156782865524292, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8682091236114502, + "num_tokens": 146797147.0, + "step": 4026 + }, + { + "epoch": 0.7478180129990715, + "grad_norm": 1.6065330505371094, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8596584796905518, + "num_tokens": 146832887.0, + "step": 4027 + }, + { + "epoch": 0.7480037140204271, + "grad_norm": 1.6336956024169922, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8655481934547424, + "num_tokens": 146864688.0, + "step": 4028 + }, + { + "epoch": 0.7481894150417827, + "grad_norm": 1.5627334117889404, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8731702566146851, + "num_tokens": 146900301.0, + "step": 4029 + }, + { + "epoch": 0.7483751160631383, + "grad_norm": 1.5333060026168823, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.858867883682251, + "num_tokens": 146934206.0, + "step": 4030 + }, + { + "epoch": 0.748560817084494, + "grad_norm": 1.428916335105896, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8818086385726929, + "num_tokens": 146970158.0, + "step": 4031 + }, + { + "epoch": 0.7487465181058496, + "grad_norm": 1.6777081489562988, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8623192310333252, + "num_tokens": 147002942.0, + "step": 4032 + }, + { + "epoch": 0.7489322191272052, + "grad_norm": 1.5242723226547241, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.875138521194458, + "num_tokens": 147039135.0, + "step": 4033 + }, + { + "epoch": 0.7491179201485608, + "grad_norm": 1.5838252305984497, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8606715202331543, + "num_tokens": 147073137.0, + "step": 4034 + }, + { + "epoch": 0.7493036211699164, + "grad_norm": 1.7140289545059204, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8516000509262085, + "num_tokens": 147105307.0, + "step": 4035 + }, + { + "epoch": 0.7494893221912721, + "grad_norm": 1.5647985935211182, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8688691854476929, + "num_tokens": 147139414.0, + "step": 4036 + }, + { + "epoch": 0.7496750232126277, + "grad_norm": 1.668750286102295, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8640919327735901, + "num_tokens": 147168219.0, + "step": 4037 + }, + { + "epoch": 0.7498607242339833, + "grad_norm": 1.5685696601867676, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8608804941177368, + "num_tokens": 147207431.0, + "step": 4038 + }, + { + "epoch": 0.7500464252553389, + "grad_norm": 1.5877289772033691, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8539688587188721, + "num_tokens": 147245153.0, + "step": 4039 + }, + { + "epoch": 0.7502321262766946, + "grad_norm": 1.5033091306686401, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8691211342811584, + "num_tokens": 147281466.0, + "step": 4040 + }, + { + "epoch": 0.7504178272980502, + "grad_norm": 1.49905264377594, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8690472841262817, + "num_tokens": 147320132.0, + "step": 4041 + }, + { + "epoch": 0.7506035283194058, + "grad_norm": 1.61111581325531, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8659775257110596, + "num_tokens": 147355698.0, + "step": 4042 + }, + { + "epoch": 0.7507892293407614, + "grad_norm": 1.6582399606704712, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8411189317703247, + "num_tokens": 147391320.0, + "step": 4043 + }, + { + "epoch": 0.750974930362117, + "grad_norm": 1.5394346714019775, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8671056032180786, + "num_tokens": 147430688.0, + "step": 4044 + }, + { + "epoch": 0.7511606313834726, + "grad_norm": 1.6124775409698486, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.861031174659729, + "num_tokens": 147465201.0, + "step": 4045 + }, + { + "epoch": 0.7513463324048282, + "grad_norm": 1.6245434284210205, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8572514057159424, + "num_tokens": 147500589.0, + "step": 4046 + }, + { + "epoch": 0.7515320334261838, + "grad_norm": 1.5848599672317505, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.871839702129364, + "num_tokens": 147535181.0, + "step": 4047 + }, + { + "epoch": 0.7517177344475394, + "grad_norm": 1.5513888597488403, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.870672345161438, + "num_tokens": 147567069.0, + "step": 4048 + }, + { + "epoch": 0.751903435468895, + "grad_norm": 1.495694637298584, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8768114447593689, + "num_tokens": 147604419.0, + "step": 4049 + }, + { + "epoch": 0.7520891364902507, + "grad_norm": 1.6612966060638428, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8531875610351562, + "num_tokens": 147639413.0, + "step": 4050 + }, + { + "epoch": 0.7522748375116063, + "grad_norm": 1.672332525253296, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8758343458175659, + "num_tokens": 147671259.0, + "step": 4051 + }, + { + "epoch": 0.7524605385329619, + "grad_norm": 1.3690623044967651, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8611222505569458, + "num_tokens": 147715996.0, + "step": 4052 + }, + { + "epoch": 0.7526462395543175, + "grad_norm": 1.6885476112365723, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8675357699394226, + "num_tokens": 147747729.0, + "step": 4053 + }, + { + "epoch": 0.7528319405756732, + "grad_norm": 1.5261794328689575, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.882490336894989, + "num_tokens": 147783604.0, + "step": 4054 + }, + { + "epoch": 0.7530176415970288, + "grad_norm": 1.5102152824401855, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8420863151550293, + "num_tokens": 147828365.0, + "step": 4055 + }, + { + "epoch": 0.7532033426183844, + "grad_norm": 1.6290462017059326, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8625918030738831, + "num_tokens": 147863109.0, + "step": 4056 + }, + { + "epoch": 0.75338904363974, + "grad_norm": 1.6206378936767578, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8803126215934753, + "num_tokens": 147893564.0, + "step": 4057 + }, + { + "epoch": 0.7535747446610956, + "grad_norm": 1.7727034091949463, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8620954751968384, + "num_tokens": 147923977.0, + "step": 4058 + }, + { + "epoch": 0.7537604456824513, + "grad_norm": 1.7499414682388306, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8616624474525452, + "num_tokens": 147954947.0, + "step": 4059 + }, + { + "epoch": 0.7539461467038069, + "grad_norm": 1.6752537488937378, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8638389110565186, + "num_tokens": 147986232.0, + "step": 4060 + }, + { + "epoch": 0.7541318477251625, + "grad_norm": 1.5312963724136353, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8618491888046265, + "num_tokens": 148023122.0, + "step": 4061 + }, + { + "epoch": 0.7543175487465181, + "grad_norm": 1.5719932317733765, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8658655881881714, + "num_tokens": 148059405.0, + "step": 4062 + }, + { + "epoch": 0.7545032497678738, + "grad_norm": 1.4743435382843018, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8829210996627808, + "num_tokens": 148094733.0, + "step": 4063 + }, + { + "epoch": 0.7546889507892294, + "grad_norm": 1.5230867862701416, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8668341040611267, + "num_tokens": 148129348.0, + "step": 4064 + }, + { + "epoch": 0.754874651810585, + "grad_norm": 1.5975862741470337, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8770791888237, + "num_tokens": 148161929.0, + "step": 4065 + }, + { + "epoch": 0.7550603528319406, + "grad_norm": 1.5245084762573242, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8729585409164429, + "num_tokens": 148196241.0, + "step": 4066 + }, + { + "epoch": 0.7552460538532962, + "grad_norm": 1.3734263181686401, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8741641044616699, + "num_tokens": 148237845.0, + "step": 4067 + }, + { + "epoch": 0.7554317548746519, + "grad_norm": 1.557126760482788, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8666344881057739, + "num_tokens": 148275452.0, + "step": 4068 + }, + { + "epoch": 0.7556174558960074, + "grad_norm": 1.5465737581253052, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8708664178848267, + "num_tokens": 148312634.0, + "step": 4069 + }, + { + "epoch": 0.755803156917363, + "grad_norm": 1.4765567779541016, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.866156816482544, + "num_tokens": 148350115.0, + "step": 4070 + }, + { + "epoch": 0.7559888579387186, + "grad_norm": 1.444970965385437, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.869277834892273, + "num_tokens": 148394524.0, + "step": 4071 + }, + { + "epoch": 0.7561745589600742, + "grad_norm": 1.6816606521606445, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8606032133102417, + "num_tokens": 148425845.0, + "step": 4072 + }, + { + "epoch": 0.7563602599814299, + "grad_norm": 1.6278730630874634, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8807412385940552, + "num_tokens": 148458508.0, + "step": 4073 + }, + { + "epoch": 0.7565459610027855, + "grad_norm": 1.4953190088272095, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.860845148563385, + "num_tokens": 148496316.0, + "step": 4074 + }, + { + "epoch": 0.7567316620241411, + "grad_norm": 1.6640828847885132, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8623664975166321, + "num_tokens": 148527218.0, + "step": 4075 + }, + { + "epoch": 0.7569173630454967, + "grad_norm": 1.6965404748916626, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8544617891311646, + "num_tokens": 148562247.0, + "step": 4076 + }, + { + "epoch": 0.7571030640668523, + "grad_norm": 1.5114790201187134, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8642348051071167, + "num_tokens": 148602408.0, + "step": 4077 + }, + { + "epoch": 0.757288765088208, + "grad_norm": 1.5480619668960571, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8661856651306152, + "num_tokens": 148641206.0, + "step": 4078 + }, + { + "epoch": 0.7574744661095636, + "grad_norm": 1.6398836374282837, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8725865483283997, + "num_tokens": 148672963.0, + "step": 4079 + }, + { + "epoch": 0.7576601671309192, + "grad_norm": 1.5473538637161255, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8674492835998535, + "num_tokens": 148705041.0, + "step": 4080 + }, + { + "epoch": 0.7578458681522748, + "grad_norm": 1.5686534643173218, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8738937377929688, + "num_tokens": 148740414.0, + "step": 4081 + }, + { + "epoch": 0.7580315691736305, + "grad_norm": 1.5114984512329102, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8593465685844421, + "num_tokens": 148775356.0, + "step": 4082 + }, + { + "epoch": 0.7582172701949861, + "grad_norm": 1.5953205823898315, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.862028956413269, + "num_tokens": 148810506.0, + "step": 4083 + }, + { + "epoch": 0.7584029712163417, + "grad_norm": 1.530269742012024, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8541624546051025, + "num_tokens": 148847520.0, + "step": 4084 + }, + { + "epoch": 0.7585886722376973, + "grad_norm": 1.4709365367889404, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8701184988021851, + "num_tokens": 148887482.0, + "step": 4085 + }, + { + "epoch": 0.758774373259053, + "grad_norm": 1.439600944519043, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.880608081817627, + "num_tokens": 148923409.0, + "step": 4086 + }, + { + "epoch": 0.7589600742804086, + "grad_norm": 1.7166532278060913, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8787652850151062, + "num_tokens": 148950674.0, + "step": 4087 + }, + { + "epoch": 0.7591457753017642, + "grad_norm": 1.3579065799713135, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8766722679138184, + "num_tokens": 148992271.0, + "step": 4088 + }, + { + "epoch": 0.7593314763231198, + "grad_norm": 1.595223069190979, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8761810660362244, + "num_tokens": 149024503.0, + "step": 4089 + }, + { + "epoch": 0.7595171773444754, + "grad_norm": 1.5826529264450073, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8607833981513977, + "num_tokens": 149059591.0, + "step": 4090 + }, + { + "epoch": 0.7597028783658311, + "grad_norm": 1.5693718194961548, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8660517930984497, + "num_tokens": 149093476.0, + "step": 4091 + }, + { + "epoch": 0.7598885793871867, + "grad_norm": 1.4942080974578857, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8602727055549622, + "num_tokens": 149129616.0, + "step": 4092 + }, + { + "epoch": 0.7600742804085423, + "grad_norm": 1.5326011180877686, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8793764114379883, + "num_tokens": 149162769.0, + "step": 4093 + }, + { + "epoch": 0.7602599814298978, + "grad_norm": 1.6717743873596191, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.870112955570221, + "num_tokens": 149192780.0, + "step": 4094 + }, + { + "epoch": 0.7604456824512534, + "grad_norm": 1.3645836114883423, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8703635334968567, + "num_tokens": 149238144.0, + "step": 4095 + }, + { + "epoch": 0.7606313834726091, + "grad_norm": 1.4238402843475342, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8676159977912903, + "num_tokens": 149277628.0, + "step": 4096 + }, + { + "epoch": 0.7608170844939647, + "grad_norm": 1.5235912799835205, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8525549173355103, + "num_tokens": 149315512.0, + "step": 4097 + }, + { + "epoch": 0.7610027855153203, + "grad_norm": 1.5295225381851196, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8704127073287964, + "num_tokens": 149351575.0, + "step": 4098 + }, + { + "epoch": 0.7611884865366759, + "grad_norm": 1.5637586116790771, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8620406985282898, + "num_tokens": 149386185.0, + "step": 4099 + }, + { + "epoch": 0.7613741875580315, + "grad_norm": 1.4871602058410645, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8726671934127808, + "num_tokens": 149426052.0, + "step": 4100 + }, + { + "epoch": 0.7615598885793872, + "grad_norm": 1.5126816034317017, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8759770393371582, + "num_tokens": 149462542.0, + "step": 4101 + }, + { + "epoch": 0.7617455896007428, + "grad_norm": 1.434862494468689, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8861168622970581, + "num_tokens": 149502527.0, + "step": 4102 + }, + { + "epoch": 0.7619312906220984, + "grad_norm": 1.531049370765686, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.870396077632904, + "num_tokens": 149537735.0, + "step": 4103 + }, + { + "epoch": 0.762116991643454, + "grad_norm": 1.5957540273666382, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8642340898513794, + "num_tokens": 149573636.0, + "step": 4104 + }, + { + "epoch": 0.7623026926648097, + "grad_norm": 1.743729829788208, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8625786900520325, + "num_tokens": 149608003.0, + "step": 4105 + }, + { + "epoch": 0.7624883936861653, + "grad_norm": 1.39577317237854, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8934877514839172, + "num_tokens": 149645080.0, + "step": 4106 + }, + { + "epoch": 0.7626740947075209, + "grad_norm": 1.392702341079712, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8678020238876343, + "num_tokens": 149684550.0, + "step": 4107 + }, + { + "epoch": 0.7628597957288765, + "grad_norm": 1.5709140300750732, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8744022250175476, + "num_tokens": 149718544.0, + "step": 4108 + }, + { + "epoch": 0.7630454967502321, + "grad_norm": 1.3820520639419556, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.873334527015686, + "num_tokens": 149757240.0, + "step": 4109 + }, + { + "epoch": 0.7632311977715878, + "grad_norm": 1.4113092422485352, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8714974522590637, + "num_tokens": 149799509.0, + "step": 4110 + }, + { + "epoch": 0.7634168987929434, + "grad_norm": 1.4935548305511475, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8620664477348328, + "num_tokens": 149835759.0, + "step": 4111 + }, + { + "epoch": 0.763602599814299, + "grad_norm": 1.4798671007156372, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8668967485427856, + "num_tokens": 149871621.0, + "step": 4112 + }, + { + "epoch": 0.7637883008356546, + "grad_norm": 1.4842889308929443, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8724674582481384, + "num_tokens": 149906086.0, + "step": 4113 + }, + { + "epoch": 0.7639740018570103, + "grad_norm": 1.530922532081604, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8451658487319946, + "num_tokens": 149948277.0, + "step": 4114 + }, + { + "epoch": 0.7641597028783659, + "grad_norm": 1.5697765350341797, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8710177540779114, + "num_tokens": 149982225.0, + "step": 4115 + }, + { + "epoch": 0.7643454038997215, + "grad_norm": 1.6895321607589722, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8689311146736145, + "num_tokens": 150015311.0, + "step": 4116 + }, + { + "epoch": 0.7645311049210771, + "grad_norm": 1.4407765865325928, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8682334423065186, + "num_tokens": 150055905.0, + "step": 4117 + }, + { + "epoch": 0.7647168059424326, + "grad_norm": 1.5021311044692993, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8847922682762146, + "num_tokens": 150086851.0, + "step": 4118 + }, + { + "epoch": 0.7649025069637883, + "grad_norm": 1.5007827281951904, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8517233729362488, + "num_tokens": 150127346.0, + "step": 4119 + }, + { + "epoch": 0.7650882079851439, + "grad_norm": 1.718079924583435, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8648877739906311, + "num_tokens": 150158389.0, + "step": 4120 + }, + { + "epoch": 0.7652739090064995, + "grad_norm": 1.6244481801986694, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8693869113922119, + "num_tokens": 150190670.0, + "step": 4121 + }, + { + "epoch": 0.7654596100278551, + "grad_norm": 1.5187276601791382, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8720563650131226, + "num_tokens": 150223431.0, + "step": 4122 + }, + { + "epoch": 0.7656453110492107, + "grad_norm": 1.4545503854751587, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8702768087387085, + "num_tokens": 150265878.0, + "step": 4123 + }, + { + "epoch": 0.7658310120705664, + "grad_norm": 1.462367296218872, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8703896403312683, + "num_tokens": 150306474.0, + "step": 4124 + }, + { + "epoch": 0.766016713091922, + "grad_norm": 1.375329613685608, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8800052404403687, + "num_tokens": 150350594.0, + "step": 4125 + }, + { + "epoch": 0.7662024141132776, + "grad_norm": 1.4559788703918457, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8741199970245361, + "num_tokens": 150392880.0, + "step": 4126 + }, + { + "epoch": 0.7663881151346332, + "grad_norm": 1.4981117248535156, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8562917113304138, + "num_tokens": 150431556.0, + "step": 4127 + }, + { + "epoch": 0.7665738161559889, + "grad_norm": 1.5108953714370728, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8556658029556274, + "num_tokens": 150469653.0, + "step": 4128 + }, + { + "epoch": 0.7667595171773445, + "grad_norm": 1.5515198707580566, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8665624260902405, + "num_tokens": 150505351.0, + "step": 4129 + }, + { + "epoch": 0.7669452181987001, + "grad_norm": 1.647278904914856, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.860709547996521, + "num_tokens": 150539922.0, + "step": 4130 + }, + { + "epoch": 0.7671309192200557, + "grad_norm": 1.5793938636779785, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8608763217926025, + "num_tokens": 150574606.0, + "step": 4131 + }, + { + "epoch": 0.7673166202414113, + "grad_norm": 1.5417698621749878, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8541445732116699, + "num_tokens": 150609116.0, + "step": 4132 + }, + { + "epoch": 0.767502321262767, + "grad_norm": 1.6214340925216675, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8648971319198608, + "num_tokens": 150641400.0, + "step": 4133 + }, + { + "epoch": 0.7676880222841226, + "grad_norm": 1.4405583143234253, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.871236264705658, + "num_tokens": 150678399.0, + "step": 4134 + }, + { + "epoch": 0.7678737233054782, + "grad_norm": 1.4538179636001587, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8659191131591797, + "num_tokens": 150719675.0, + "step": 4135 + }, + { + "epoch": 0.7680594243268338, + "grad_norm": 1.5882874727249146, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8691509962081909, + "num_tokens": 150750191.0, + "step": 4136 + }, + { + "epoch": 0.7682451253481895, + "grad_norm": 1.534356713294983, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8662316799163818, + "num_tokens": 150788210.0, + "step": 4137 + }, + { + "epoch": 0.7684308263695451, + "grad_norm": 1.5843874216079712, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8568152785301208, + "num_tokens": 150823587.0, + "step": 4138 + }, + { + "epoch": 0.7686165273909007, + "grad_norm": 1.4251304864883423, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8648384809494019, + "num_tokens": 150864082.0, + "step": 4139 + }, + { + "epoch": 0.7688022284122563, + "grad_norm": 1.5364210605621338, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8821064233779907, + "num_tokens": 150902219.0, + "step": 4140 + }, + { + "epoch": 0.7689879294336119, + "grad_norm": 1.4423490762710571, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8779168128967285, + "num_tokens": 150938613.0, + "step": 4141 + }, + { + "epoch": 0.7691736304549674, + "grad_norm": 1.5052188634872437, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8720800876617432, + "num_tokens": 150975831.0, + "step": 4142 + }, + { + "epoch": 0.7693593314763231, + "grad_norm": 1.4165538549423218, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8678125739097595, + "num_tokens": 151016305.0, + "step": 4143 + }, + { + "epoch": 0.7695450324976787, + "grad_norm": 1.418215274810791, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8766506910324097, + "num_tokens": 151055383.0, + "step": 4144 + }, + { + "epoch": 0.7697307335190343, + "grad_norm": 1.4567142724990845, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8710037469863892, + "num_tokens": 151090934.0, + "step": 4145 + }, + { + "epoch": 0.7699164345403899, + "grad_norm": 1.4338055849075317, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8588835000991821, + "num_tokens": 151133835.0, + "step": 4146 + }, + { + "epoch": 0.7701021355617456, + "grad_norm": 1.5273765325546265, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8672481179237366, + "num_tokens": 151170771.0, + "step": 4147 + }, + { + "epoch": 0.7702878365831012, + "grad_norm": 1.5046025514602661, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8626601696014404, + "num_tokens": 151209318.0, + "step": 4148 + }, + { + "epoch": 0.7704735376044568, + "grad_norm": 1.6051499843597412, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8617100715637207, + "num_tokens": 151246360.0, + "step": 4149 + }, + { + "epoch": 0.7706592386258124, + "grad_norm": 1.470252275466919, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8579708337783813, + "num_tokens": 151286785.0, + "step": 4150 + }, + { + "epoch": 0.770844939647168, + "grad_norm": 1.7633908987045288, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8500776290893555, + "num_tokens": 151319513.0, + "step": 4151 + }, + { + "epoch": 0.7710306406685237, + "grad_norm": 1.3520597219467163, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8744633197784424, + "num_tokens": 151361976.0, + "step": 4152 + }, + { + "epoch": 0.7712163416898793, + "grad_norm": 1.4061877727508545, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8715285062789917, + "num_tokens": 151405609.0, + "step": 4153 + }, + { + "epoch": 0.7714020427112349, + "grad_norm": 1.6480302810668945, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.863402247428894, + "num_tokens": 151437900.0, + "step": 4154 + }, + { + "epoch": 0.7715877437325905, + "grad_norm": 1.6065369844436646, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8535611629486084, + "num_tokens": 151474088.0, + "step": 4155 + }, + { + "epoch": 0.7717734447539462, + "grad_norm": 1.5364819765090942, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8704243302345276, + "num_tokens": 151510562.0, + "step": 4156 + }, + { + "epoch": 0.7719591457753018, + "grad_norm": 1.4230679273605347, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8888841867446899, + "num_tokens": 151546377.0, + "step": 4157 + }, + { + "epoch": 0.7721448467966574, + "grad_norm": 1.634508728981018, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8813660740852356, + "num_tokens": 151578980.0, + "step": 4158 + }, + { + "epoch": 0.772330547818013, + "grad_norm": 1.4848171472549438, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8731296062469482, + "num_tokens": 151615663.0, + "step": 4159 + }, + { + "epoch": 0.7725162488393686, + "grad_norm": 1.5038483142852783, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8634511828422546, + "num_tokens": 151655775.0, + "step": 4160 + }, + { + "epoch": 0.7727019498607243, + "grad_norm": 1.544689416885376, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8627230525016785, + "num_tokens": 151693299.0, + "step": 4161 + }, + { + "epoch": 0.7728876508820799, + "grad_norm": 1.3792165517807007, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8703543543815613, + "num_tokens": 151733533.0, + "step": 4162 + }, + { + "epoch": 0.7730733519034355, + "grad_norm": 1.6582235097885132, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.859826922416687, + "num_tokens": 151765629.0, + "step": 4163 + }, + { + "epoch": 0.7732590529247911, + "grad_norm": 1.4807461500167847, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.870621919631958, + "num_tokens": 151803650.0, + "step": 4164 + }, + { + "epoch": 0.7734447539461468, + "grad_norm": 1.5176664590835571, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8723985552787781, + "num_tokens": 151838367.0, + "step": 4165 + }, + { + "epoch": 0.7736304549675023, + "grad_norm": 1.606894612312317, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8755286335945129, + "num_tokens": 151869820.0, + "step": 4166 + }, + { + "epoch": 0.7738161559888579, + "grad_norm": 1.4957865476608276, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8686844706535339, + "num_tokens": 151906223.0, + "step": 4167 + }, + { + "epoch": 0.7740018570102135, + "grad_norm": 1.4280564785003662, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8659453392028809, + "num_tokens": 151945932.0, + "step": 4168 + }, + { + "epoch": 0.7741875580315691, + "grad_norm": 1.4212722778320312, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8634549379348755, + "num_tokens": 151989044.0, + "step": 4169 + }, + { + "epoch": 0.7743732590529248, + "grad_norm": 1.6208630800247192, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.867163896560669, + "num_tokens": 152023784.0, + "step": 4170 + }, + { + "epoch": 0.7745589600742804, + "grad_norm": 1.6600356101989746, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8700168132781982, + "num_tokens": 152058610.0, + "step": 4171 + }, + { + "epoch": 0.774744661095636, + "grad_norm": 1.3751263618469238, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8817538022994995, + "num_tokens": 152097737.0, + "step": 4172 + }, + { + "epoch": 0.7749303621169916, + "grad_norm": 1.5387768745422363, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8698477149009705, + "num_tokens": 152131861.0, + "step": 4173 + }, + { + "epoch": 0.7751160631383472, + "grad_norm": 1.5386533737182617, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8750281929969788, + "num_tokens": 152168075.0, + "step": 4174 + }, + { + "epoch": 0.7753017641597029, + "grad_norm": 1.5182123184204102, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8618336319923401, + "num_tokens": 152207012.0, + "step": 4175 + }, + { + "epoch": 0.7754874651810585, + "grad_norm": 1.519647479057312, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8781227469444275, + "num_tokens": 152241529.0, + "step": 4176 + }, + { + "epoch": 0.7756731662024141, + "grad_norm": 1.4681165218353271, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8709154725074768, + "num_tokens": 152277759.0, + "step": 4177 + }, + { + "epoch": 0.7758588672237697, + "grad_norm": 1.5143779516220093, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8513550758361816, + "num_tokens": 152314343.0, + "step": 4178 + }, + { + "epoch": 0.7760445682451254, + "grad_norm": 1.6396636962890625, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8420097827911377, + "num_tokens": 152350386.0, + "step": 4179 + }, + { + "epoch": 0.776230269266481, + "grad_norm": 1.4736768007278442, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8786271810531616, + "num_tokens": 152389831.0, + "step": 4180 + }, + { + "epoch": 0.7764159702878366, + "grad_norm": 1.5783780813217163, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8697953820228577, + "num_tokens": 152423702.0, + "step": 4181 + }, + { + "epoch": 0.7766016713091922, + "grad_norm": 1.5480796098709106, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8802312612533569, + "num_tokens": 152459250.0, + "step": 4182 + }, + { + "epoch": 0.7767873723305478, + "grad_norm": 1.5992399454116821, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8613199591636658, + "num_tokens": 152496379.0, + "step": 4183 + }, + { + "epoch": 0.7769730733519035, + "grad_norm": 1.505638599395752, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8742409944534302, + "num_tokens": 152533285.0, + "step": 4184 + }, + { + "epoch": 0.7771587743732591, + "grad_norm": 1.5757477283477783, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8795684576034546, + "num_tokens": 152567834.0, + "step": 4185 + }, + { + "epoch": 0.7773444753946147, + "grad_norm": 1.440436840057373, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8613372445106506, + "num_tokens": 152608524.0, + "step": 4186 + }, + { + "epoch": 0.7775301764159703, + "grad_norm": 1.7313233613967896, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8628292083740234, + "num_tokens": 152640788.0, + "step": 4187 + }, + { + "epoch": 0.777715877437326, + "grad_norm": 1.5166887044906616, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8658787608146667, + "num_tokens": 152678387.0, + "step": 4188 + }, + { + "epoch": 0.7779015784586816, + "grad_norm": 1.559706687927246, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8610435724258423, + "num_tokens": 152715332.0, + "step": 4189 + }, + { + "epoch": 0.7780872794800371, + "grad_norm": 1.6647812128067017, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8752343654632568, + "num_tokens": 152744994.0, + "step": 4190 + }, + { + "epoch": 0.7782729805013927, + "grad_norm": 1.4323636293411255, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8685044050216675, + "num_tokens": 152784345.0, + "step": 4191 + }, + { + "epoch": 0.7784586815227483, + "grad_norm": 1.5251182317733765, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8640562295913696, + "num_tokens": 152820321.0, + "step": 4192 + }, + { + "epoch": 0.778644382544104, + "grad_norm": 1.5773190259933472, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8730364441871643, + "num_tokens": 152851353.0, + "step": 4193 + }, + { + "epoch": 0.7788300835654596, + "grad_norm": 1.5499162673950195, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8693978190422058, + "num_tokens": 152884872.0, + "step": 4194 + }, + { + "epoch": 0.7790157845868152, + "grad_norm": 1.5435080528259277, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8650177717208862, + "num_tokens": 152925381.0, + "step": 4195 + }, + { + "epoch": 0.7792014856081708, + "grad_norm": 1.540621042251587, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.872043251991272, + "num_tokens": 152958854.0, + "step": 4196 + }, + { + "epoch": 0.7793871866295264, + "grad_norm": 1.5603280067443848, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8535465598106384, + "num_tokens": 152992585.0, + "step": 4197 + }, + { + "epoch": 0.7795728876508821, + "grad_norm": 1.5317305326461792, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8663105368614197, + "num_tokens": 153027153.0, + "step": 4198 + }, + { + "epoch": 0.7797585886722377, + "grad_norm": 1.3511343002319336, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8715361952781677, + "num_tokens": 153073176.0, + "step": 4199 + }, + { + "epoch": 0.7799442896935933, + "grad_norm": 1.6587461233139038, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8575836420059204, + "num_tokens": 153105142.0, + "step": 4200 + }, + { + "epoch": 0.7801299907149489, + "grad_norm": 1.4262324571609497, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8749011158943176, + "num_tokens": 153144952.0, + "step": 4201 + }, + { + "epoch": 0.7803156917363046, + "grad_norm": 1.5537209510803223, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8694216012954712, + "num_tokens": 153179179.0, + "step": 4202 + }, + { + "epoch": 0.7805013927576602, + "grad_norm": 1.5428979396820068, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8502482175827026, + "num_tokens": 153221487.0, + "step": 4203 + }, + { + "epoch": 0.7806870937790158, + "grad_norm": 1.5080335140228271, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8749780058860779, + "num_tokens": 153255824.0, + "step": 4204 + }, + { + "epoch": 0.7808727948003714, + "grad_norm": 1.4701721668243408, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8745801448822021, + "num_tokens": 153298372.0, + "step": 4205 + }, + { + "epoch": 0.781058495821727, + "grad_norm": 1.4178639650344849, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8630547523498535, + "num_tokens": 153338554.0, + "step": 4206 + }, + { + "epoch": 0.7812441968430827, + "grad_norm": 1.4331474304199219, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8718722462654114, + "num_tokens": 153375944.0, + "step": 4207 + }, + { + "epoch": 0.7814298978644383, + "grad_norm": 1.5523478984832764, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8717104196548462, + "num_tokens": 153412161.0, + "step": 4208 + }, + { + "epoch": 0.7816155988857939, + "grad_norm": 1.3670929670333862, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.881112813949585, + "num_tokens": 153453050.0, + "step": 4209 + }, + { + "epoch": 0.7818012999071495, + "grad_norm": 1.5000972747802734, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8709017038345337, + "num_tokens": 153490266.0, + "step": 4210 + }, + { + "epoch": 0.7819870009285051, + "grad_norm": 1.5046677589416504, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8660895824432373, + "num_tokens": 153525794.0, + "step": 4211 + }, + { + "epoch": 0.7821727019498608, + "grad_norm": 1.4881017208099365, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8777223229408264, + "num_tokens": 153563282.0, + "step": 4212 + }, + { + "epoch": 0.7823584029712164, + "grad_norm": 1.8876585960388184, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.867997407913208, + "num_tokens": 153595551.0, + "step": 4213 + }, + { + "epoch": 0.7825441039925719, + "grad_norm": 1.5454281568527222, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8599157333374023, + "num_tokens": 153636322.0, + "step": 4214 + }, + { + "epoch": 0.7827298050139275, + "grad_norm": 1.503023624420166, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8648863434791565, + "num_tokens": 153674421.0, + "step": 4215 + }, + { + "epoch": 0.7829155060352831, + "grad_norm": 1.4806115627288818, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.861212968826294, + "num_tokens": 153710789.0, + "step": 4216 + }, + { + "epoch": 0.7831012070566388, + "grad_norm": 1.501671552658081, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8758800625801086, + "num_tokens": 153746541.0, + "step": 4217 + }, + { + "epoch": 0.7832869080779944, + "grad_norm": 1.5749578475952148, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8633943796157837, + "num_tokens": 153781945.0, + "step": 4218 + }, + { + "epoch": 0.78347260909935, + "grad_norm": 1.6039711236953735, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8743516206741333, + "num_tokens": 153813079.0, + "step": 4219 + }, + { + "epoch": 0.7836583101207056, + "grad_norm": 1.6692978143692017, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8626335859298706, + "num_tokens": 153845506.0, + "step": 4220 + }, + { + "epoch": 0.7838440111420613, + "grad_norm": 1.6309205293655396, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8594741821289062, + "num_tokens": 153882916.0, + "step": 4221 + }, + { + "epoch": 0.7840297121634169, + "grad_norm": 1.4868905544281006, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8746958374977112, + "num_tokens": 153919491.0, + "step": 4222 + }, + { + "epoch": 0.7842154131847725, + "grad_norm": 1.3251888751983643, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8861899375915527, + "num_tokens": 153962466.0, + "step": 4223 + }, + { + "epoch": 0.7844011142061281, + "grad_norm": 1.5716924667358398, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8609697818756104, + "num_tokens": 153996658.0, + "step": 4224 + }, + { + "epoch": 0.7845868152274837, + "grad_norm": 1.4617815017700195, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.881895899772644, + "num_tokens": 154030981.0, + "step": 4225 + }, + { + "epoch": 0.7847725162488394, + "grad_norm": 1.5610820055007935, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8640789985656738, + "num_tokens": 154069074.0, + "step": 4226 + }, + { + "epoch": 0.784958217270195, + "grad_norm": 1.56863272190094, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8603873252868652, + "num_tokens": 154108618.0, + "step": 4227 + }, + { + "epoch": 0.7851439182915506, + "grad_norm": 1.4534903764724731, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8838340044021606, + "num_tokens": 154144209.0, + "step": 4228 + }, + { + "epoch": 0.7853296193129062, + "grad_norm": 1.4305747747421265, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8665359020233154, + "num_tokens": 154182432.0, + "step": 4229 + }, + { + "epoch": 0.7855153203342619, + "grad_norm": 1.475696086883545, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8685344457626343, + "num_tokens": 154220954.0, + "step": 4230 + }, + { + "epoch": 0.7857010213556175, + "grad_norm": 1.434601902961731, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8672246932983398, + "num_tokens": 154260185.0, + "step": 4231 + }, + { + "epoch": 0.7858867223769731, + "grad_norm": 1.639867901802063, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8514015674591064, + "num_tokens": 154292250.0, + "step": 4232 + }, + { + "epoch": 0.7860724233983287, + "grad_norm": 1.4763989448547363, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8549239039421082, + "num_tokens": 154333640.0, + "step": 4233 + }, + { + "epoch": 0.7862581244196843, + "grad_norm": 1.4986047744750977, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8740405440330505, + "num_tokens": 154372298.0, + "step": 4234 + }, + { + "epoch": 0.78644382544104, + "grad_norm": 1.5050965547561646, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8694338202476501, + "num_tokens": 154407413.0, + "step": 4235 + }, + { + "epoch": 0.7866295264623956, + "grad_norm": 1.5420891046524048, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8557302355766296, + "num_tokens": 154445922.0, + "step": 4236 + }, + { + "epoch": 0.7868152274837512, + "grad_norm": 1.4549964666366577, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8647811412811279, + "num_tokens": 154489044.0, + "step": 4237 + }, + { + "epoch": 0.7870009285051067, + "grad_norm": 1.4923557043075562, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8717917203903198, + "num_tokens": 154524353.0, + "step": 4238 + }, + { + "epoch": 0.7871866295264623, + "grad_norm": 1.4156520366668701, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8836181163787842, + "num_tokens": 154562060.0, + "step": 4239 + }, + { + "epoch": 0.787372330547818, + "grad_norm": 1.5505791902542114, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8713875412940979, + "num_tokens": 154595990.0, + "step": 4240 + }, + { + "epoch": 0.7875580315691736, + "grad_norm": 1.473474144935608, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8553708791732788, + "num_tokens": 154638355.0, + "step": 4241 + }, + { + "epoch": 0.7877437325905292, + "grad_norm": 1.538291335105896, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8522263169288635, + "num_tokens": 154677337.0, + "step": 4242 + }, + { + "epoch": 0.7879294336118848, + "grad_norm": 1.5677775144577026, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8411795496940613, + "num_tokens": 154717724.0, + "step": 4243 + }, + { + "epoch": 0.7881151346332405, + "grad_norm": 1.5740070343017578, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8711071610450745, + "num_tokens": 154754135.0, + "step": 4244 + }, + { + "epoch": 0.7883008356545961, + "grad_norm": 1.4531415700912476, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8598685264587402, + "num_tokens": 154796456.0, + "step": 4245 + }, + { + "epoch": 0.7884865366759517, + "grad_norm": 1.52119779586792, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.867250382900238, + "num_tokens": 154833279.0, + "step": 4246 + }, + { + "epoch": 0.7886722376973073, + "grad_norm": 1.5502394437789917, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8726793527603149, + "num_tokens": 154868171.0, + "step": 4247 + }, + { + "epoch": 0.7888579387186629, + "grad_norm": 1.5823005437850952, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8573567867279053, + "num_tokens": 154902089.0, + "step": 4248 + }, + { + "epoch": 0.7890436397400186, + "grad_norm": 1.6247944831848145, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8580107688903809, + "num_tokens": 154937615.0, + "step": 4249 + }, + { + "epoch": 0.7892293407613742, + "grad_norm": 1.472815990447998, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8668798208236694, + "num_tokens": 154978243.0, + "step": 4250 + }, + { + "epoch": 0.7894150417827298, + "grad_norm": 1.5198501348495483, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8690446615219116, + "num_tokens": 155012177.0, + "step": 4251 + }, + { + "epoch": 0.7896007428040854, + "grad_norm": 1.4182132482528687, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8804907202720642, + "num_tokens": 155048333.0, + "step": 4252 + }, + { + "epoch": 0.789786443825441, + "grad_norm": 1.463340163230896, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.878146767616272, + "num_tokens": 155083203.0, + "step": 4253 + }, + { + "epoch": 0.7899721448467967, + "grad_norm": 1.5536234378814697, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8729429244995117, + "num_tokens": 155114559.0, + "step": 4254 + }, + { + "epoch": 0.7901578458681523, + "grad_norm": 1.5824730396270752, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8646825551986694, + "num_tokens": 155148886.0, + "step": 4255 + }, + { + "epoch": 0.7903435468895079, + "grad_norm": 1.5751335620880127, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.861892580986023, + "num_tokens": 155184679.0, + "step": 4256 + }, + { + "epoch": 0.7905292479108635, + "grad_norm": 1.5658538341522217, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8520452976226807, + "num_tokens": 155224647.0, + "step": 4257 + }, + { + "epoch": 0.7907149489322192, + "grad_norm": 1.5659185647964478, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8790842294692993, + "num_tokens": 155258426.0, + "step": 4258 + }, + { + "epoch": 0.7909006499535748, + "grad_norm": 1.6156305074691772, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8643882274627686, + "num_tokens": 155290608.0, + "step": 4259 + }, + { + "epoch": 0.7910863509749304, + "grad_norm": 1.6294989585876465, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8484616279602051, + "num_tokens": 155325792.0, + "step": 4260 + }, + { + "epoch": 0.791272051996286, + "grad_norm": 1.6972942352294922, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.858905553817749, + "num_tokens": 155359524.0, + "step": 4261 + }, + { + "epoch": 0.7914577530176417, + "grad_norm": 1.6530590057373047, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8573298454284668, + "num_tokens": 155391561.0, + "step": 4262 + }, + { + "epoch": 0.7916434540389972, + "grad_norm": 1.4747365713119507, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8716208934783936, + "num_tokens": 155425671.0, + "step": 4263 + }, + { + "epoch": 0.7918291550603528, + "grad_norm": 1.5602601766586304, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8677301406860352, + "num_tokens": 155463708.0, + "step": 4264 + }, + { + "epoch": 0.7920148560817084, + "grad_norm": 1.5291845798492432, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8672611117362976, + "num_tokens": 155504099.0, + "step": 4265 + }, + { + "epoch": 0.792200557103064, + "grad_norm": 1.582018256187439, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8599400520324707, + "num_tokens": 155539689.0, + "step": 4266 + }, + { + "epoch": 0.7923862581244197, + "grad_norm": 1.5409440994262695, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8763899803161621, + "num_tokens": 155574786.0, + "step": 4267 + }, + { + "epoch": 0.7925719591457753, + "grad_norm": 1.5155318975448608, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8795323371887207, + "num_tokens": 155608533.0, + "step": 4268 + }, + { + "epoch": 0.7927576601671309, + "grad_norm": 1.5783528089523315, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8812889456748962, + "num_tokens": 155642631.0, + "step": 4269 + }, + { + "epoch": 0.7929433611884865, + "grad_norm": 1.6224271059036255, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8417602777481079, + "num_tokens": 155683249.0, + "step": 4270 + }, + { + "epoch": 0.7931290622098421, + "grad_norm": 1.6132209300994873, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8687326908111572, + "num_tokens": 155716462.0, + "step": 4271 + }, + { + "epoch": 0.7933147632311978, + "grad_norm": 1.4495769739151, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8693642616271973, + "num_tokens": 155751151.0, + "step": 4272 + }, + { + "epoch": 0.7935004642525534, + "grad_norm": 1.4688940048217773, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8673449754714966, + "num_tokens": 155789575.0, + "step": 4273 + }, + { + "epoch": 0.793686165273909, + "grad_norm": 1.5689769983291626, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.86939537525177, + "num_tokens": 155822341.0, + "step": 4274 + }, + { + "epoch": 0.7938718662952646, + "grad_norm": 1.6834845542907715, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8665052652359009, + "num_tokens": 155852420.0, + "step": 4275 + }, + { + "epoch": 0.7940575673166202, + "grad_norm": 1.3480557203292847, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8792508840560913, + "num_tokens": 155890481.0, + "step": 4276 + }, + { + "epoch": 0.7942432683379759, + "grad_norm": 1.7199639081954956, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.875299870967865, + "num_tokens": 155920683.0, + "step": 4277 + }, + { + "epoch": 0.7944289693593315, + "grad_norm": 1.5060100555419922, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8674430847167969, + "num_tokens": 155956637.0, + "step": 4278 + }, + { + "epoch": 0.7946146703806871, + "grad_norm": 1.490239143371582, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8652340173721313, + "num_tokens": 155990933.0, + "step": 4279 + }, + { + "epoch": 0.7948003714020427, + "grad_norm": 1.5706182718276978, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.869519829750061, + "num_tokens": 156024679.0, + "step": 4280 + }, + { + "epoch": 0.7949860724233984, + "grad_norm": 1.4606889486312866, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.87687087059021, + "num_tokens": 156059277.0, + "step": 4281 + }, + { + "epoch": 0.795171773444754, + "grad_norm": 1.5663176774978638, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8590043783187866, + "num_tokens": 156097616.0, + "step": 4282 + }, + { + "epoch": 0.7953574744661096, + "grad_norm": 1.7172397375106812, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8562338948249817, + "num_tokens": 156126562.0, + "step": 4283 + }, + { + "epoch": 0.7955431754874652, + "grad_norm": 1.6314826011657715, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.873650312423706, + "num_tokens": 156158558.0, + "step": 4284 + }, + { + "epoch": 0.7957288765088208, + "grad_norm": 1.5249569416046143, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.859390139579773, + "num_tokens": 156195212.0, + "step": 4285 + }, + { + "epoch": 0.7959145775301765, + "grad_norm": 1.5307313203811646, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8729921579360962, + "num_tokens": 156230555.0, + "step": 4286 + }, + { + "epoch": 0.796100278551532, + "grad_norm": 1.5508936643600464, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8569731116294861, + "num_tokens": 156266964.0, + "step": 4287 + }, + { + "epoch": 0.7962859795728876, + "grad_norm": 1.826920509338379, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8656418323516846, + "num_tokens": 156299592.0, + "step": 4288 + }, + { + "epoch": 0.7964716805942432, + "grad_norm": 1.4118634462356567, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8740012049674988, + "num_tokens": 156339712.0, + "step": 4289 + }, + { + "epoch": 0.7966573816155988, + "grad_norm": 1.5839478969573975, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8485768437385559, + "num_tokens": 156376491.0, + "step": 4290 + }, + { + "epoch": 0.7968430826369545, + "grad_norm": 1.5642518997192383, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8570942878723145, + "num_tokens": 156412761.0, + "step": 4291 + }, + { + "epoch": 0.7970287836583101, + "grad_norm": 1.4616284370422363, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8677073121070862, + "num_tokens": 156451622.0, + "step": 4292 + }, + { + "epoch": 0.7972144846796657, + "grad_norm": 1.5498156547546387, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8725653886795044, + "num_tokens": 156484669.0, + "step": 4293 + }, + { + "epoch": 0.7974001857010213, + "grad_norm": 1.6002285480499268, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8693035840988159, + "num_tokens": 156516684.0, + "step": 4294 + }, + { + "epoch": 0.797585886722377, + "grad_norm": 1.4627230167388916, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8685055375099182, + "num_tokens": 156553804.0, + "step": 4295 + }, + { + "epoch": 0.7977715877437326, + "grad_norm": 1.4593446254730225, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8733581304550171, + "num_tokens": 156591240.0, + "step": 4296 + }, + { + "epoch": 0.7979572887650882, + "grad_norm": 1.4021825790405273, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.874639630317688, + "num_tokens": 156628536.0, + "step": 4297 + }, + { + "epoch": 0.7981429897864438, + "grad_norm": 1.5964109897613525, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8595660924911499, + "num_tokens": 156664790.0, + "step": 4298 + }, + { + "epoch": 0.7983286908077994, + "grad_norm": 1.4262968301773071, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.869519829750061, + "num_tokens": 156704686.0, + "step": 4299 + }, + { + "epoch": 0.7985143918291551, + "grad_norm": 1.4262579679489136, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8552118539810181, + "num_tokens": 156746083.0, + "step": 4300 + }, + { + "epoch": 0.7987000928505107, + "grad_norm": 1.6182011365890503, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8662210702896118, + "num_tokens": 156777436.0, + "step": 4301 + }, + { + "epoch": 0.7988857938718663, + "grad_norm": 1.6383512020111084, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8576027154922485, + "num_tokens": 156811872.0, + "step": 4302 + }, + { + "epoch": 0.7990714948932219, + "grad_norm": 1.506239891052246, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8616481423377991, + "num_tokens": 156851488.0, + "step": 4303 + }, + { + "epoch": 0.7992571959145776, + "grad_norm": 1.624696135520935, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8611726760864258, + "num_tokens": 156884171.0, + "step": 4304 + }, + { + "epoch": 0.7994428969359332, + "grad_norm": 1.4781997203826904, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8764322400093079, + "num_tokens": 156921602.0, + "step": 4305 + }, + { + "epoch": 0.7996285979572888, + "grad_norm": 1.4665460586547852, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8665599226951599, + "num_tokens": 156959769.0, + "step": 4306 + }, + { + "epoch": 0.7998142989786444, + "grad_norm": 1.6087417602539062, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8516508936882019, + "num_tokens": 156994479.0, + "step": 4307 + }, + { + "epoch": 0.8, + "grad_norm": 1.6846117973327637, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8569083213806152, + "num_tokens": 157026908.0, + "step": 4308 + }, + { + "epoch": 0.8001857010213557, + "grad_norm": 1.4631906747817993, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8630129098892212, + "num_tokens": 157062770.0, + "step": 4309 + }, + { + "epoch": 0.8003714020427113, + "grad_norm": 1.5201791524887085, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8734050989151001, + "num_tokens": 157098086.0, + "step": 4310 + }, + { + "epoch": 0.8005571030640668, + "grad_norm": 1.3721895217895508, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8881222009658813, + "num_tokens": 157133197.0, + "step": 4311 + }, + { + "epoch": 0.8007428040854224, + "grad_norm": 1.5981942415237427, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8684988021850586, + "num_tokens": 157169068.0, + "step": 4312 + }, + { + "epoch": 0.800928505106778, + "grad_norm": 1.51587975025177, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8795971274375916, + "num_tokens": 157204109.0, + "step": 4313 + }, + { + "epoch": 0.8011142061281337, + "grad_norm": 1.608554720878601, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8571174144744873, + "num_tokens": 157235614.0, + "step": 4314 + }, + { + "epoch": 0.8012999071494893, + "grad_norm": 1.5150824785232544, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8652125000953674, + "num_tokens": 157270621.0, + "step": 4315 + }, + { + "epoch": 0.8014856081708449, + "grad_norm": 1.5228378772735596, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8649919033050537, + "num_tokens": 157310027.0, + "step": 4316 + }, + { + "epoch": 0.8016713091922005, + "grad_norm": 1.4896641969680786, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8680175542831421, + "num_tokens": 157349344.0, + "step": 4317 + }, + { + "epoch": 0.8018570102135562, + "grad_norm": 1.3620586395263672, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.889613151550293, + "num_tokens": 157385921.0, + "step": 4318 + }, + { + "epoch": 0.8020427112349118, + "grad_norm": 1.5678324699401855, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8594503402709961, + "num_tokens": 157423650.0, + "step": 4319 + }, + { + "epoch": 0.8022284122562674, + "grad_norm": 1.5127822160720825, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8391141295433044, + "num_tokens": 157466542.0, + "step": 4320 + }, + { + "epoch": 0.802414113277623, + "grad_norm": 1.4980791807174683, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8706099987030029, + "num_tokens": 157504197.0, + "step": 4321 + }, + { + "epoch": 0.8025998142989786, + "grad_norm": 1.336069107055664, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8726605772972107, + "num_tokens": 157546078.0, + "step": 4322 + }, + { + "epoch": 0.8027855153203343, + "grad_norm": 1.6537152528762817, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8652158975601196, + "num_tokens": 157577547.0, + "step": 4323 + }, + { + "epoch": 0.8029712163416899, + "grad_norm": 1.5674537420272827, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8663674592971802, + "num_tokens": 157612061.0, + "step": 4324 + }, + { + "epoch": 0.8031569173630455, + "grad_norm": 1.5102524757385254, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8735361099243164, + "num_tokens": 157648980.0, + "step": 4325 + }, + { + "epoch": 0.8033426183844011, + "grad_norm": 1.4548083543777466, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8757888674736023, + "num_tokens": 157685636.0, + "step": 4326 + }, + { + "epoch": 0.8035283194057568, + "grad_norm": 1.5061739683151245, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8701467514038086, + "num_tokens": 157722440.0, + "step": 4327 + }, + { + "epoch": 0.8037140204271124, + "grad_norm": 1.4380161762237549, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8598506450653076, + "num_tokens": 157761670.0, + "step": 4328 + }, + { + "epoch": 0.803899721448468, + "grad_norm": 1.6894229650497437, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8688138723373413, + "num_tokens": 157792918.0, + "step": 4329 + }, + { + "epoch": 0.8040854224698236, + "grad_norm": 1.6463873386383057, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8602112531661987, + "num_tokens": 157827497.0, + "step": 4330 + }, + { + "epoch": 0.8042711234911792, + "grad_norm": 1.5410487651824951, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8637622594833374, + "num_tokens": 157863152.0, + "step": 4331 + }, + { + "epoch": 0.8044568245125349, + "grad_norm": 1.5305771827697754, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8604897856712341, + "num_tokens": 157902822.0, + "step": 4332 + }, + { + "epoch": 0.8046425255338905, + "grad_norm": 1.5531985759735107, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8777871131896973, + "num_tokens": 157934044.0, + "step": 4333 + }, + { + "epoch": 0.8048282265552461, + "grad_norm": 1.5695569515228271, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.857900083065033, + "num_tokens": 157969942.0, + "step": 4334 + }, + { + "epoch": 0.8050139275766016, + "grad_norm": 1.3993791341781616, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8736743330955505, + "num_tokens": 158008113.0, + "step": 4335 + }, + { + "epoch": 0.8051996285979572, + "grad_norm": 1.4513527154922485, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8823447823524475, + "num_tokens": 158042968.0, + "step": 4336 + }, + { + "epoch": 0.8053853296193129, + "grad_norm": 1.4544786214828491, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8726022839546204, + "num_tokens": 158079420.0, + "step": 4337 + }, + { + "epoch": 0.8055710306406685, + "grad_norm": 1.5785295963287354, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8651669025421143, + "num_tokens": 158114113.0, + "step": 4338 + }, + { + "epoch": 0.8057567316620241, + "grad_norm": 1.530779480934143, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.869661271572113, + "num_tokens": 158155479.0, + "step": 4339 + }, + { + "epoch": 0.8059424326833797, + "grad_norm": 1.4628556966781616, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8725442886352539, + "num_tokens": 158191164.0, + "step": 4340 + }, + { + "epoch": 0.8061281337047354, + "grad_norm": 1.407698154449463, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8767462968826294, + "num_tokens": 158234186.0, + "step": 4341 + }, + { + "epoch": 0.806313834726091, + "grad_norm": 1.572425365447998, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8600886464118958, + "num_tokens": 158269040.0, + "step": 4342 + }, + { + "epoch": 0.8064995357474466, + "grad_norm": 1.6088060140609741, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8760358095169067, + "num_tokens": 158301318.0, + "step": 4343 + }, + { + "epoch": 0.8066852367688022, + "grad_norm": 1.4592455625534058, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8796088099479675, + "num_tokens": 158340655.0, + "step": 4344 + }, + { + "epoch": 0.8068709377901578, + "grad_norm": 1.5853148698806763, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8649873733520508, + "num_tokens": 158379713.0, + "step": 4345 + }, + { + "epoch": 0.8070566388115135, + "grad_norm": 1.5556707382202148, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8527546525001526, + "num_tokens": 158418432.0, + "step": 4346 + }, + { + "epoch": 0.8072423398328691, + "grad_norm": 1.6500529050827026, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.840008020401001, + "num_tokens": 158451900.0, + "step": 4347 + }, + { + "epoch": 0.8074280408542247, + "grad_norm": 1.5411944389343262, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.860369861125946, + "num_tokens": 158489411.0, + "step": 4348 + }, + { + "epoch": 0.8076137418755803, + "grad_norm": 1.475250244140625, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8768261671066284, + "num_tokens": 158526164.0, + "step": 4349 + }, + { + "epoch": 0.807799442896936, + "grad_norm": 1.5448803901672363, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8560774326324463, + "num_tokens": 158565914.0, + "step": 4350 + }, + { + "epoch": 0.8079851439182916, + "grad_norm": 1.5627906322479248, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8594831228256226, + "num_tokens": 158604848.0, + "step": 4351 + }, + { + "epoch": 0.8081708449396472, + "grad_norm": 1.5636446475982666, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8713967800140381, + "num_tokens": 158641163.0, + "step": 4352 + }, + { + "epoch": 0.8083565459610028, + "grad_norm": 1.3333410024642944, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8609170913696289, + "num_tokens": 158688150.0, + "step": 4353 + }, + { + "epoch": 0.8085422469823584, + "grad_norm": 1.609724760055542, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.873087465763092, + "num_tokens": 158719470.0, + "step": 4354 + }, + { + "epoch": 0.8087279480037141, + "grad_norm": 1.4930686950683594, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8742950558662415, + "num_tokens": 158756641.0, + "step": 4355 + }, + { + "epoch": 0.8089136490250697, + "grad_norm": 1.5697129964828491, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8528450131416321, + "num_tokens": 158795685.0, + "step": 4356 + }, + { + "epoch": 0.8090993500464253, + "grad_norm": 1.491416335105896, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8711502552032471, + "num_tokens": 158834050.0, + "step": 4357 + }, + { + "epoch": 0.8092850510677809, + "grad_norm": 1.5734766721725464, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8604506254196167, + "num_tokens": 158873338.0, + "step": 4358 + }, + { + "epoch": 0.8094707520891364, + "grad_norm": 1.4209301471710205, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8686230182647705, + "num_tokens": 158914286.0, + "step": 4359 + }, + { + "epoch": 0.8096564531104921, + "grad_norm": 1.4420713186264038, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8737082481384277, + "num_tokens": 158952303.0, + "step": 4360 + }, + { + "epoch": 0.8098421541318477, + "grad_norm": 1.4650613069534302, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8692317008972168, + "num_tokens": 158987155.0, + "step": 4361 + }, + { + "epoch": 0.8100278551532033, + "grad_norm": 1.4371984004974365, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8774532079696655, + "num_tokens": 159024691.0, + "step": 4362 + }, + { + "epoch": 0.8102135561745589, + "grad_norm": 1.3899857997894287, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8813093900680542, + "num_tokens": 159062252.0, + "step": 4363 + }, + { + "epoch": 0.8103992571959145, + "grad_norm": 1.554732322692871, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8731346130371094, + "num_tokens": 159098117.0, + "step": 4364 + }, + { + "epoch": 0.8105849582172702, + "grad_norm": 1.687057375907898, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8520195484161377, + "num_tokens": 159135179.0, + "step": 4365 + }, + { + "epoch": 0.8107706592386258, + "grad_norm": 1.4394396543502808, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8630466461181641, + "num_tokens": 159176340.0, + "step": 4366 + }, + { + "epoch": 0.8109563602599814, + "grad_norm": 1.3725383281707764, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.869876503944397, + "num_tokens": 159219448.0, + "step": 4367 + }, + { + "epoch": 0.811142061281337, + "grad_norm": 1.6500452756881714, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8539562225341797, + "num_tokens": 159250806.0, + "step": 4368 + }, + { + "epoch": 0.8113277623026927, + "grad_norm": 1.3954782485961914, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8698676824569702, + "num_tokens": 159293259.0, + "step": 4369 + }, + { + "epoch": 0.8115134633240483, + "grad_norm": 1.4760881662368774, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8649605512619019, + "num_tokens": 159329339.0, + "step": 4370 + }, + { + "epoch": 0.8116991643454039, + "grad_norm": 1.5024526119232178, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8743280172348022, + "num_tokens": 159366013.0, + "step": 4371 + }, + { + "epoch": 0.8118848653667595, + "grad_norm": 1.391858458518982, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8592361211776733, + "num_tokens": 159410726.0, + "step": 4372 + }, + { + "epoch": 0.8120705663881151, + "grad_norm": 1.5220063924789429, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8630543947219849, + "num_tokens": 159447694.0, + "step": 4373 + }, + { + "epoch": 0.8122562674094708, + "grad_norm": 1.5794434547424316, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8575243353843689, + "num_tokens": 159487389.0, + "step": 4374 + }, + { + "epoch": 0.8124419684308264, + "grad_norm": 1.5860810279846191, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8826050162315369, + "num_tokens": 159520959.0, + "step": 4375 + }, + { + "epoch": 0.812627669452182, + "grad_norm": 1.4305803775787354, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8729540109634399, + "num_tokens": 159561697.0, + "step": 4376 + }, + { + "epoch": 0.8128133704735376, + "grad_norm": 1.412978172302246, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8787452578544617, + "num_tokens": 159598514.0, + "step": 4377 + }, + { + "epoch": 0.8129990714948933, + "grad_norm": 1.5970834493637085, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.869968056678772, + "num_tokens": 159631547.0, + "step": 4378 + }, + { + "epoch": 0.8131847725162489, + "grad_norm": 1.7511802911758423, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8516591787338257, + "num_tokens": 159662804.0, + "step": 4379 + }, + { + "epoch": 0.8133704735376045, + "grad_norm": 1.5239930152893066, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8712980151176453, + "num_tokens": 159700003.0, + "step": 4380 + }, + { + "epoch": 0.8135561745589601, + "grad_norm": 1.4064302444458008, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8604908585548401, + "num_tokens": 159742928.0, + "step": 4381 + }, + { + "epoch": 0.8137418755803157, + "grad_norm": 1.6090604066848755, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8614596128463745, + "num_tokens": 159778679.0, + "step": 4382 + }, + { + "epoch": 0.8139275766016713, + "grad_norm": 1.4484529495239258, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8778481483459473, + "num_tokens": 159816353.0, + "step": 4383 + }, + { + "epoch": 0.8141132776230269, + "grad_norm": 1.5776163339614868, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8654576539993286, + "num_tokens": 159851874.0, + "step": 4384 + }, + { + "epoch": 0.8142989786443825, + "grad_norm": 1.4290788173675537, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8528772592544556, + "num_tokens": 159895286.0, + "step": 4385 + }, + { + "epoch": 0.8144846796657381, + "grad_norm": 1.50492262840271, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8730236291885376, + "num_tokens": 159929167.0, + "step": 4386 + }, + { + "epoch": 0.8146703806870937, + "grad_norm": 1.4944405555725098, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8651376366615295, + "num_tokens": 159967351.0, + "step": 4387 + }, + { + "epoch": 0.8148560817084494, + "grad_norm": 1.5160908699035645, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8807441592216492, + "num_tokens": 160003282.0, + "step": 4388 + }, + { + "epoch": 0.815041782729805, + "grad_norm": 1.678189754486084, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8622167706489563, + "num_tokens": 160038949.0, + "step": 4389 + }, + { + "epoch": 0.8152274837511606, + "grad_norm": 1.6270079612731934, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8522655963897705, + "num_tokens": 160070661.0, + "step": 4390 + }, + { + "epoch": 0.8154131847725162, + "grad_norm": 1.64986252784729, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.87522953748703, + "num_tokens": 160103413.0, + "step": 4391 + }, + { + "epoch": 0.8155988857938719, + "grad_norm": 1.530263066291809, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8844796419143677, + "num_tokens": 160136987.0, + "step": 4392 + }, + { + "epoch": 0.8157845868152275, + "grad_norm": 1.5153776407241821, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8733962774276733, + "num_tokens": 160170757.0, + "step": 4393 + }, + { + "epoch": 0.8159702878365831, + "grad_norm": 1.5232619047164917, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.865730881690979, + "num_tokens": 160206482.0, + "step": 4394 + }, + { + "epoch": 0.8161559888579387, + "grad_norm": 1.4675474166870117, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8648719191551208, + "num_tokens": 160245075.0, + "step": 4395 + }, + { + "epoch": 0.8163416898792943, + "grad_norm": 1.6537277698516846, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.85776686668396, + "num_tokens": 160279263.0, + "step": 4396 + }, + { + "epoch": 0.81652739090065, + "grad_norm": 1.4526114463806152, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8599447011947632, + "num_tokens": 160320705.0, + "step": 4397 + }, + { + "epoch": 0.8167130919220056, + "grad_norm": 1.541061520576477, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8587109446525574, + "num_tokens": 160358051.0, + "step": 4398 + }, + { + "epoch": 0.8168987929433612, + "grad_norm": 1.4213054180145264, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8652417659759521, + "num_tokens": 160401204.0, + "step": 4399 + }, + { + "epoch": 0.8170844939647168, + "grad_norm": 1.4288562536239624, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8844435811042786, + "num_tokens": 160436986.0, + "step": 4400 + }, + { + "epoch": 0.8172701949860725, + "grad_norm": 1.7365435361862183, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8641987442970276, + "num_tokens": 160467265.0, + "step": 4401 + }, + { + "epoch": 0.8174558960074281, + "grad_norm": 1.3913863897323608, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8772029280662537, + "num_tokens": 160505700.0, + "step": 4402 + }, + { + "epoch": 0.8176415970287837, + "grad_norm": 1.4647164344787598, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8659013509750366, + "num_tokens": 160545340.0, + "step": 4403 + }, + { + "epoch": 0.8178272980501393, + "grad_norm": 1.4599645137786865, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8782066702842712, + "num_tokens": 160581127.0, + "step": 4404 + }, + { + "epoch": 0.8180129990714949, + "grad_norm": 1.425286054611206, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8798731565475464, + "num_tokens": 160616013.0, + "step": 4405 + }, + { + "epoch": 0.8181987000928506, + "grad_norm": 1.6003459692001343, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8681113123893738, + "num_tokens": 160650233.0, + "step": 4406 + }, + { + "epoch": 0.8183844011142062, + "grad_norm": 1.60861074924469, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8688862323760986, + "num_tokens": 160685294.0, + "step": 4407 + }, + { + "epoch": 0.8185701021355617, + "grad_norm": 1.6896342039108276, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8678441643714905, + "num_tokens": 160716949.0, + "step": 4408 + }, + { + "epoch": 0.8187558031569173, + "grad_norm": 1.522316813468933, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8659149408340454, + "num_tokens": 160752088.0, + "step": 4409 + }, + { + "epoch": 0.8189415041782729, + "grad_norm": 1.4504876136779785, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8590008020401001, + "num_tokens": 160791277.0, + "step": 4410 + }, + { + "epoch": 0.8191272051996286, + "grad_norm": 1.3923791646957397, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8716808557510376, + "num_tokens": 160838512.0, + "step": 4411 + }, + { + "epoch": 0.8193129062209842, + "grad_norm": 1.551408052444458, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8694687485694885, + "num_tokens": 160873789.0, + "step": 4412 + }, + { + "epoch": 0.8194986072423398, + "grad_norm": 1.5530864000320435, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8538832664489746, + "num_tokens": 160912066.0, + "step": 4413 + }, + { + "epoch": 0.8196843082636954, + "grad_norm": 1.4120343923568726, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8754185438156128, + "num_tokens": 160949794.0, + "step": 4414 + }, + { + "epoch": 0.819870009285051, + "grad_norm": 1.5273206233978271, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8557196855545044, + "num_tokens": 160989018.0, + "step": 4415 + }, + { + "epoch": 0.8200557103064067, + "grad_norm": 1.5684189796447754, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8579322695732117, + "num_tokens": 161023897.0, + "step": 4416 + }, + { + "epoch": 0.8202414113277623, + "grad_norm": 1.5868785381317139, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8752211928367615, + "num_tokens": 161062657.0, + "step": 4417 + }, + { + "epoch": 0.8204271123491179, + "grad_norm": 1.720775842666626, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.867612361907959, + "num_tokens": 161096798.0, + "step": 4418 + }, + { + "epoch": 0.8206128133704735, + "grad_norm": 1.520796298980713, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8603786826133728, + "num_tokens": 161132327.0, + "step": 4419 + }, + { + "epoch": 0.8207985143918292, + "grad_norm": 1.6660199165344238, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.862742006778717, + "num_tokens": 161163418.0, + "step": 4420 + }, + { + "epoch": 0.8209842154131848, + "grad_norm": 1.4114264249801636, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8702205419540405, + "num_tokens": 161203030.0, + "step": 4421 + }, + { + "epoch": 0.8211699164345404, + "grad_norm": 1.3584312200546265, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8802610635757446, + "num_tokens": 161241464.0, + "step": 4422 + }, + { + "epoch": 0.821355617455896, + "grad_norm": 1.5656788349151611, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8537704944610596, + "num_tokens": 161278462.0, + "step": 4423 + }, + { + "epoch": 0.8215413184772516, + "grad_norm": 1.5740725994110107, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8499287366867065, + "num_tokens": 161316880.0, + "step": 4424 + }, + { + "epoch": 0.8217270194986073, + "grad_norm": 1.6320509910583496, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8704602718353271, + "num_tokens": 161350021.0, + "step": 4425 + }, + { + "epoch": 0.8219127205199629, + "grad_norm": 1.5627737045288086, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8685410022735596, + "num_tokens": 161386530.0, + "step": 4426 + }, + { + "epoch": 0.8220984215413185, + "grad_norm": 1.6801643371582031, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8697314858436584, + "num_tokens": 161418800.0, + "step": 4427 + }, + { + "epoch": 0.8222841225626741, + "grad_norm": 1.459120273590088, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8659734725952148, + "num_tokens": 161457351.0, + "step": 4428 + }, + { + "epoch": 0.8224698235840298, + "grad_norm": 1.613312005996704, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8754850625991821, + "num_tokens": 161490960.0, + "step": 4429 + }, + { + "epoch": 0.8226555246053854, + "grad_norm": 1.4797554016113281, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8665233850479126, + "num_tokens": 161531286.0, + "step": 4430 + }, + { + "epoch": 0.822841225626741, + "grad_norm": 1.5500506162643433, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8778242468833923, + "num_tokens": 161564479.0, + "step": 4431 + }, + { + "epoch": 0.8230269266480965, + "grad_norm": 1.568028450012207, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8598816394805908, + "num_tokens": 161599150.0, + "step": 4432 + }, + { + "epoch": 0.8232126276694521, + "grad_norm": 1.4776352643966675, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.871241569519043, + "num_tokens": 161638748.0, + "step": 4433 + }, + { + "epoch": 0.8233983286908078, + "grad_norm": 1.490055799484253, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.861174464225769, + "num_tokens": 161678075.0, + "step": 4434 + }, + { + "epoch": 0.8235840297121634, + "grad_norm": 1.5513832569122314, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8645132780075073, + "num_tokens": 161710003.0, + "step": 4435 + }, + { + "epoch": 0.823769730733519, + "grad_norm": 1.4693065881729126, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8771657943725586, + "num_tokens": 161747183.0, + "step": 4436 + }, + { + "epoch": 0.8239554317548746, + "grad_norm": 1.523029088973999, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.851543128490448, + "num_tokens": 161785778.0, + "step": 4437 + }, + { + "epoch": 0.8241411327762302, + "grad_norm": 1.4049230813980103, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8711178302764893, + "num_tokens": 161823737.0, + "step": 4438 + }, + { + "epoch": 0.8243268337975859, + "grad_norm": 1.512332797050476, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8789929747581482, + "num_tokens": 161857021.0, + "step": 4439 + }, + { + "epoch": 0.8245125348189415, + "grad_norm": 1.5961098670959473, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8715110421180725, + "num_tokens": 161885351.0, + "step": 4440 + }, + { + "epoch": 0.8246982358402971, + "grad_norm": 1.5070021152496338, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8628423810005188, + "num_tokens": 161925395.0, + "step": 4441 + }, + { + "epoch": 0.8248839368616527, + "grad_norm": 1.5211663246154785, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8467214107513428, + "num_tokens": 161965221.0, + "step": 4442 + }, + { + "epoch": 0.8250696378830084, + "grad_norm": 1.4484617710113525, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.871396005153656, + "num_tokens": 162003651.0, + "step": 4443 + }, + { + "epoch": 0.825255338904364, + "grad_norm": 1.5366569757461548, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8636787533760071, + "num_tokens": 162045435.0, + "step": 4444 + }, + { + "epoch": 0.8254410399257196, + "grad_norm": 1.5965148210525513, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8681790232658386, + "num_tokens": 162076657.0, + "step": 4445 + }, + { + "epoch": 0.8256267409470752, + "grad_norm": 1.3873299360275269, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8719862103462219, + "num_tokens": 162116666.0, + "step": 4446 + }, + { + "epoch": 0.8258124419684308, + "grad_norm": 1.5485754013061523, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8412141799926758, + "num_tokens": 162155059.0, + "step": 4447 + }, + { + "epoch": 0.8259981429897865, + "grad_norm": 1.4185651540756226, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8686252236366272, + "num_tokens": 162195210.0, + "step": 4448 + }, + { + "epoch": 0.8261838440111421, + "grad_norm": 1.4698154926300049, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8581966161727905, + "num_tokens": 162240949.0, + "step": 4449 + }, + { + "epoch": 0.8263695450324977, + "grad_norm": 1.6732821464538574, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8736514449119568, + "num_tokens": 162271023.0, + "step": 4450 + }, + { + "epoch": 0.8265552460538533, + "grad_norm": 1.676073670387268, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8338172435760498, + "num_tokens": 162307732.0, + "step": 4451 + }, + { + "epoch": 0.826740947075209, + "grad_norm": 1.560293436050415, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8726905584335327, + "num_tokens": 162341948.0, + "step": 4452 + }, + { + "epoch": 0.8269266480965646, + "grad_norm": 1.7614680528640747, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8577431440353394, + "num_tokens": 162370428.0, + "step": 4453 + }, + { + "epoch": 0.8271123491179202, + "grad_norm": 1.4810781478881836, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8722622394561768, + "num_tokens": 162413369.0, + "step": 4454 + }, + { + "epoch": 0.8272980501392758, + "grad_norm": 1.5220941305160522, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8582457304000854, + "num_tokens": 162456583.0, + "step": 4455 + }, + { + "epoch": 0.8274837511606313, + "grad_norm": 1.413385272026062, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8675047159194946, + "num_tokens": 162497980.0, + "step": 4456 + }, + { + "epoch": 0.827669452181987, + "grad_norm": 1.5873295068740845, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8716996908187866, + "num_tokens": 162530205.0, + "step": 4457 + }, + { + "epoch": 0.8278551532033426, + "grad_norm": 1.540108323097229, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8612102270126343, + "num_tokens": 162565247.0, + "step": 4458 + }, + { + "epoch": 0.8280408542246982, + "grad_norm": 1.5902422666549683, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8594802618026733, + "num_tokens": 162597538.0, + "step": 4459 + }, + { + "epoch": 0.8282265552460538, + "grad_norm": 1.6544286012649536, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8514112234115601, + "num_tokens": 162631917.0, + "step": 4460 + }, + { + "epoch": 0.8284122562674094, + "grad_norm": 1.6493805646896362, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8830804228782654, + "num_tokens": 162665732.0, + "step": 4461 + }, + { + "epoch": 0.8285979572887651, + "grad_norm": 1.62562096118927, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8712383508682251, + "num_tokens": 162695899.0, + "step": 4462 + }, + { + "epoch": 0.8287836583101207, + "grad_norm": 1.6146804094314575, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8546397089958191, + "num_tokens": 162732269.0, + "step": 4463 + }, + { + "epoch": 0.8289693593314763, + "grad_norm": 1.5251461267471313, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8581138253211975, + "num_tokens": 162768433.0, + "step": 4464 + }, + { + "epoch": 0.8291550603528319, + "grad_norm": 1.4291372299194336, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.89338219165802, + "num_tokens": 162803120.0, + "step": 4465 + }, + { + "epoch": 0.8293407613741876, + "grad_norm": 1.5290234088897705, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8679112195968628, + "num_tokens": 162840094.0, + "step": 4466 + }, + { + "epoch": 0.8295264623955432, + "grad_norm": 1.5523823499679565, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8408204317092896, + "num_tokens": 162883633.0, + "step": 4467 + }, + { + "epoch": 0.8297121634168988, + "grad_norm": 1.4089998006820679, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8646234273910522, + "num_tokens": 162923329.0, + "step": 4468 + }, + { + "epoch": 0.8298978644382544, + "grad_norm": 1.5249031782150269, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8653584122657776, + "num_tokens": 162958818.0, + "step": 4469 + }, + { + "epoch": 0.83008356545961, + "grad_norm": 1.468916893005371, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.88039231300354, + "num_tokens": 162992346.0, + "step": 4470 + }, + { + "epoch": 0.8302692664809657, + "grad_norm": 1.483718991279602, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8611186742782593, + "num_tokens": 163029729.0, + "step": 4471 + }, + { + "epoch": 0.8304549675023213, + "grad_norm": 1.457736611366272, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8827160000801086, + "num_tokens": 163065302.0, + "step": 4472 + }, + { + "epoch": 0.8306406685236769, + "grad_norm": 1.4146088361740112, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8741018772125244, + "num_tokens": 163102750.0, + "step": 4473 + }, + { + "epoch": 0.8308263695450325, + "grad_norm": 1.4086233377456665, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8802127242088318, + "num_tokens": 163141283.0, + "step": 4474 + }, + { + "epoch": 0.8310120705663882, + "grad_norm": 1.4740363359451294, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.884882926940918, + "num_tokens": 163173807.0, + "step": 4475 + }, + { + "epoch": 0.8311977715877438, + "grad_norm": 1.6351702213287354, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8523685932159424, + "num_tokens": 163207357.0, + "step": 4476 + }, + { + "epoch": 0.8313834726090994, + "grad_norm": 1.6007400751113892, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8582108020782471, + "num_tokens": 163246114.0, + "step": 4477 + }, + { + "epoch": 0.831569173630455, + "grad_norm": 1.4935166835784912, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8623567819595337, + "num_tokens": 163285954.0, + "step": 4478 + }, + { + "epoch": 0.8317548746518106, + "grad_norm": 1.4788086414337158, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8725292682647705, + "num_tokens": 163320009.0, + "step": 4479 + }, + { + "epoch": 0.8319405756731661, + "grad_norm": 1.5333807468414307, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8670333623886108, + "num_tokens": 163358286.0, + "step": 4480 + }, + { + "epoch": 0.8321262766945218, + "grad_norm": 1.6011126041412354, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8607580065727234, + "num_tokens": 163394797.0, + "step": 4481 + }, + { + "epoch": 0.8323119777158774, + "grad_norm": 1.6212360858917236, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.881132960319519, + "num_tokens": 163426778.0, + "step": 4482 + }, + { + "epoch": 0.832497678737233, + "grad_norm": 1.5061707496643066, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8561774492263794, + "num_tokens": 163464515.0, + "step": 4483 + }, + { + "epoch": 0.8326833797585886, + "grad_norm": 1.4256806373596191, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8690445423126221, + "num_tokens": 163503142.0, + "step": 4484 + }, + { + "epoch": 0.8328690807799443, + "grad_norm": 1.3454891443252563, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8801947832107544, + "num_tokens": 163544616.0, + "step": 4485 + }, + { + "epoch": 0.8330547818012999, + "grad_norm": 1.6411794424057007, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8573052883148193, + "num_tokens": 163581601.0, + "step": 4486 + }, + { + "epoch": 0.8332404828226555, + "grad_norm": 1.636495590209961, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8714510202407837, + "num_tokens": 163614738.0, + "step": 4487 + }, + { + "epoch": 0.8334261838440111, + "grad_norm": 1.6771018505096436, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8480499982833862, + "num_tokens": 163646141.0, + "step": 4488 + }, + { + "epoch": 0.8336118848653667, + "grad_norm": 1.4988477230072021, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.882737934589386, + "num_tokens": 163683471.0, + "step": 4489 + }, + { + "epoch": 0.8337975858867224, + "grad_norm": 1.4016462564468384, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.879788339138031, + "num_tokens": 163721515.0, + "step": 4490 + }, + { + "epoch": 0.833983286908078, + "grad_norm": 1.6583110094070435, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8655291795730591, + "num_tokens": 163753121.0, + "step": 4491 + }, + { + "epoch": 0.8341689879294336, + "grad_norm": 1.76653254032135, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8718252182006836, + "num_tokens": 163783560.0, + "step": 4492 + }, + { + "epoch": 0.8343546889507892, + "grad_norm": 1.5030782222747803, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8593662977218628, + "num_tokens": 163822785.0, + "step": 4493 + }, + { + "epoch": 0.8345403899721449, + "grad_norm": 1.5830391645431519, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8712161183357239, + "num_tokens": 163855156.0, + "step": 4494 + }, + { + "epoch": 0.8347260909935005, + "grad_norm": 1.5055826902389526, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8606014251708984, + "num_tokens": 163891475.0, + "step": 4495 + }, + { + "epoch": 0.8349117920148561, + "grad_norm": 1.4924147129058838, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.866050124168396, + "num_tokens": 163926542.0, + "step": 4496 + }, + { + "epoch": 0.8350974930362117, + "grad_norm": 1.5420089960098267, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8634409308433533, + "num_tokens": 163961440.0, + "step": 4497 + }, + { + "epoch": 0.8352831940575673, + "grad_norm": 1.5688419342041016, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8511619567871094, + "num_tokens": 163996770.0, + "step": 4498 + }, + { + "epoch": 0.835468895078923, + "grad_norm": 1.5065561532974243, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8715231418609619, + "num_tokens": 164029867.0, + "step": 4499 + }, + { + "epoch": 0.8356545961002786, + "grad_norm": 1.6735714673995972, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8460416793823242, + "num_tokens": 164065618.0, + "step": 4500 + }, + { + "epoch": 0.8358402971216342, + "grad_norm": 1.4777488708496094, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8739219903945923, + "num_tokens": 164108692.0, + "step": 4501 + }, + { + "epoch": 0.8360259981429898, + "grad_norm": 1.6512210369110107, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8639179468154907, + "num_tokens": 164142723.0, + "step": 4502 + }, + { + "epoch": 0.8362116991643455, + "grad_norm": 1.484415054321289, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8729467391967773, + "num_tokens": 164177594.0, + "step": 4503 + }, + { + "epoch": 0.836397400185701, + "grad_norm": 1.5356671810150146, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8629513382911682, + "num_tokens": 164215097.0, + "step": 4504 + }, + { + "epoch": 0.8365831012070566, + "grad_norm": 1.4793676137924194, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8720961809158325, + "num_tokens": 164256665.0, + "step": 4505 + }, + { + "epoch": 0.8367688022284122, + "grad_norm": 1.4976415634155273, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8645294904708862, + "num_tokens": 164293182.0, + "step": 4506 + }, + { + "epoch": 0.8369545032497678, + "grad_norm": 1.5107686519622803, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.863922119140625, + "num_tokens": 164332026.0, + "step": 4507 + }, + { + "epoch": 0.8371402042711235, + "grad_norm": 1.5150877237319946, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8690868616104126, + "num_tokens": 164370441.0, + "step": 4508 + }, + { + "epoch": 0.8373259052924791, + "grad_norm": 1.417863130569458, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8716323375701904, + "num_tokens": 164409095.0, + "step": 4509 + }, + { + "epoch": 0.8375116063138347, + "grad_norm": 1.573163628578186, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8697893619537354, + "num_tokens": 164442911.0, + "step": 4510 + }, + { + "epoch": 0.8376973073351903, + "grad_norm": 1.3995702266693115, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8748520612716675, + "num_tokens": 164484052.0, + "step": 4511 + }, + { + "epoch": 0.8378830083565459, + "grad_norm": 1.626015067100525, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8610194325447083, + "num_tokens": 164516542.0, + "step": 4512 + }, + { + "epoch": 0.8380687093779016, + "grad_norm": 1.4281606674194336, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8929046392440796, + "num_tokens": 164553113.0, + "step": 4513 + }, + { + "epoch": 0.8382544103992572, + "grad_norm": 1.5887799263000488, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.873640775680542, + "num_tokens": 164590320.0, + "step": 4514 + }, + { + "epoch": 0.8384401114206128, + "grad_norm": 1.5134536027908325, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8798348903656006, + "num_tokens": 164629761.0, + "step": 4515 + }, + { + "epoch": 0.8386258124419684, + "grad_norm": 1.478723168373108, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8734577298164368, + "num_tokens": 164667214.0, + "step": 4516 + }, + { + "epoch": 0.838811513463324, + "grad_norm": 1.6080645322799683, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8633495569229126, + "num_tokens": 164699782.0, + "step": 4517 + }, + { + "epoch": 0.8389972144846797, + "grad_norm": 1.51956045627594, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8650740385055542, + "num_tokens": 164736774.0, + "step": 4518 + }, + { + "epoch": 0.8391829155060353, + "grad_norm": 1.3680412769317627, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8741569519042969, + "num_tokens": 164776226.0, + "step": 4519 + }, + { + "epoch": 0.8393686165273909, + "grad_norm": 1.5200345516204834, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8691226840019226, + "num_tokens": 164813969.0, + "step": 4520 + }, + { + "epoch": 0.8395543175487465, + "grad_norm": 1.4498286247253418, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8675320148468018, + "num_tokens": 164858002.0, + "step": 4521 + }, + { + "epoch": 0.8397400185701022, + "grad_norm": 1.4976661205291748, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8596887588500977, + "num_tokens": 164896944.0, + "step": 4522 + }, + { + "epoch": 0.8399257195914578, + "grad_norm": 1.4822709560394287, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8706021308898926, + "num_tokens": 164932484.0, + "step": 4523 + }, + { + "epoch": 0.8401114206128134, + "grad_norm": 1.504775881767273, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8854271173477173, + "num_tokens": 164965771.0, + "step": 4524 + }, + { + "epoch": 0.840297121634169, + "grad_norm": 1.8462648391723633, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.877056360244751, + "num_tokens": 164997640.0, + "step": 4525 + }, + { + "epoch": 0.8404828226555247, + "grad_norm": 1.5099873542785645, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8822047710418701, + "num_tokens": 165033285.0, + "step": 4526 + }, + { + "epoch": 0.8406685236768803, + "grad_norm": 1.6327654123306274, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8588013052940369, + "num_tokens": 165071130.0, + "step": 4527 + }, + { + "epoch": 0.8408542246982358, + "grad_norm": 1.4999107122421265, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8839070796966553, + "num_tokens": 165104974.0, + "step": 4528 + }, + { + "epoch": 0.8410399257195914, + "grad_norm": 1.6783814430236816, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.85048508644104, + "num_tokens": 165141009.0, + "step": 4529 + }, + { + "epoch": 0.841225626740947, + "grad_norm": 1.5042539834976196, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8759633302688599, + "num_tokens": 165177528.0, + "step": 4530 + }, + { + "epoch": 0.8414113277623027, + "grad_norm": 1.4485002756118774, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8658334016799927, + "num_tokens": 165219224.0, + "step": 4531 + }, + { + "epoch": 0.8415970287836583, + "grad_norm": 1.5187807083129883, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8791971206665039, + "num_tokens": 165255993.0, + "step": 4532 + }, + { + "epoch": 0.8417827298050139, + "grad_norm": 1.4784215688705444, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8744825124740601, + "num_tokens": 165292551.0, + "step": 4533 + }, + { + "epoch": 0.8419684308263695, + "grad_norm": 1.4314101934432983, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8674124479293823, + "num_tokens": 165333269.0, + "step": 4534 + }, + { + "epoch": 0.8421541318477251, + "grad_norm": 1.5146174430847168, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8809938430786133, + "num_tokens": 165366716.0, + "step": 4535 + }, + { + "epoch": 0.8423398328690808, + "grad_norm": 1.7386773824691772, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8511018753051758, + "num_tokens": 165396770.0, + "step": 4536 + }, + { + "epoch": 0.8425255338904364, + "grad_norm": 1.4724246263504028, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8704451322555542, + "num_tokens": 165437272.0, + "step": 4537 + }, + { + "epoch": 0.842711234911792, + "grad_norm": 1.4923155307769775, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8765597343444824, + "num_tokens": 165473858.0, + "step": 4538 + }, + { + "epoch": 0.8428969359331476, + "grad_norm": 1.5831607580184937, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8546924591064453, + "num_tokens": 165512101.0, + "step": 4539 + }, + { + "epoch": 0.8430826369545033, + "grad_norm": 1.4986884593963623, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8614352941513062, + "num_tokens": 165550358.0, + "step": 4540 + }, + { + "epoch": 0.8432683379758589, + "grad_norm": 1.53452467918396, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8648552298545837, + "num_tokens": 165583984.0, + "step": 4541 + }, + { + "epoch": 0.8434540389972145, + "grad_norm": 1.5416990518569946, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8696407675743103, + "num_tokens": 165626988.0, + "step": 4542 + }, + { + "epoch": 0.8436397400185701, + "grad_norm": 1.6829555034637451, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8556210398674011, + "num_tokens": 165664117.0, + "step": 4543 + }, + { + "epoch": 0.8438254410399257, + "grad_norm": 1.6276805400848389, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8727490901947021, + "num_tokens": 165694693.0, + "step": 4544 + }, + { + "epoch": 0.8440111420612814, + "grad_norm": 1.5215644836425781, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.860589325428009, + "num_tokens": 165733856.0, + "step": 4545 + }, + { + "epoch": 0.844196843082637, + "grad_norm": 1.6031713485717773, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8746522068977356, + "num_tokens": 165770758.0, + "step": 4546 + }, + { + "epoch": 0.8443825441039926, + "grad_norm": 1.5486109256744385, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.857446551322937, + "num_tokens": 165806877.0, + "step": 4547 + }, + { + "epoch": 0.8445682451253482, + "grad_norm": 1.5263113975524902, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.866411566734314, + "num_tokens": 165843447.0, + "step": 4548 + }, + { + "epoch": 0.8447539461467038, + "grad_norm": 1.4906929731369019, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8886491060256958, + "num_tokens": 165879000.0, + "step": 4549 + }, + { + "epoch": 0.8449396471680595, + "grad_norm": 1.5267744064331055, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8651996850967407, + "num_tokens": 165915695.0, + "step": 4550 + }, + { + "epoch": 0.8451253481894151, + "grad_norm": 1.5688475370407104, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8806173801422119, + "num_tokens": 165948686.0, + "step": 4551 + }, + { + "epoch": 0.8453110492107706, + "grad_norm": 1.5345476865768433, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8574249148368835, + "num_tokens": 165986976.0, + "step": 4552 + }, + { + "epoch": 0.8454967502321262, + "grad_norm": 1.556434988975525, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8630735874176025, + "num_tokens": 166020718.0, + "step": 4553 + }, + { + "epoch": 0.8456824512534818, + "grad_norm": 1.4925767183303833, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8552469611167908, + "num_tokens": 166058787.0, + "step": 4554 + }, + { + "epoch": 0.8458681522748375, + "grad_norm": 1.470572590827942, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8755160570144653, + "num_tokens": 166093513.0, + "step": 4555 + }, + { + "epoch": 0.8460538532961931, + "grad_norm": 1.473218321800232, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8679941296577454, + "num_tokens": 166130710.0, + "step": 4556 + }, + { + "epoch": 0.8462395543175487, + "grad_norm": 1.5019053220748901, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8619822859764099, + "num_tokens": 166167545.0, + "step": 4557 + }, + { + "epoch": 0.8464252553389043, + "grad_norm": 1.4550751447677612, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8751344084739685, + "num_tokens": 166203674.0, + "step": 4558 + }, + { + "epoch": 0.84661095636026, + "grad_norm": 1.5163445472717285, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.851071834564209, + "num_tokens": 166241787.0, + "step": 4559 + }, + { + "epoch": 0.8467966573816156, + "grad_norm": 1.6405365467071533, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8658562898635864, + "num_tokens": 166280340.0, + "step": 4560 + }, + { + "epoch": 0.8469823584029712, + "grad_norm": 1.3883872032165527, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8810946941375732, + "num_tokens": 166322227.0, + "step": 4561 + }, + { + "epoch": 0.8471680594243268, + "grad_norm": 1.4858709573745728, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8607706427574158, + "num_tokens": 166358384.0, + "step": 4562 + }, + { + "epoch": 0.8473537604456824, + "grad_norm": 1.445717453956604, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8775571584701538, + "num_tokens": 166400256.0, + "step": 4563 + }, + { + "epoch": 0.8475394614670381, + "grad_norm": 1.4991093873977661, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8541430234909058, + "num_tokens": 166438426.0, + "step": 4564 + }, + { + "epoch": 0.8477251624883937, + "grad_norm": 1.630436897277832, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8552674055099487, + "num_tokens": 166471805.0, + "step": 4565 + }, + { + "epoch": 0.8479108635097493, + "grad_norm": 1.4936975240707397, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.87880539894104, + "num_tokens": 166508155.0, + "step": 4566 + }, + { + "epoch": 0.8480965645311049, + "grad_norm": 1.4934026002883911, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.870132327079773, + "num_tokens": 166543113.0, + "step": 4567 + }, + { + "epoch": 0.8482822655524606, + "grad_norm": 1.465724229812622, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.882026195526123, + "num_tokens": 166576137.0, + "step": 4568 + }, + { + "epoch": 0.8484679665738162, + "grad_norm": 1.7125709056854248, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8532787561416626, + "num_tokens": 166609361.0, + "step": 4569 + }, + { + "epoch": 0.8486536675951718, + "grad_norm": 1.6528979539871216, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8742590546607971, + "num_tokens": 166641969.0, + "step": 4570 + }, + { + "epoch": 0.8488393686165274, + "grad_norm": 1.5563658475875854, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8517792820930481, + "num_tokens": 166680266.0, + "step": 4571 + }, + { + "epoch": 0.849025069637883, + "grad_norm": 1.6548881530761719, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8641121983528137, + "num_tokens": 166717234.0, + "step": 4572 + }, + { + "epoch": 0.8492107706592387, + "grad_norm": 1.5849460363388062, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8560960292816162, + "num_tokens": 166750622.0, + "step": 4573 + }, + { + "epoch": 0.8493964716805943, + "grad_norm": 1.4731242656707764, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8826090693473816, + "num_tokens": 166788034.0, + "step": 4574 + }, + { + "epoch": 0.8495821727019499, + "grad_norm": 1.4327259063720703, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8816988468170166, + "num_tokens": 166828248.0, + "step": 4575 + }, + { + "epoch": 0.8497678737233055, + "grad_norm": 1.6399272680282593, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8590559959411621, + "num_tokens": 166861530.0, + "step": 4576 + }, + { + "epoch": 0.849953574744661, + "grad_norm": 1.7281029224395752, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8733274340629578, + "num_tokens": 166892441.0, + "step": 4577 + }, + { + "epoch": 0.8501392757660167, + "grad_norm": 1.5510814189910889, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8738013505935669, + "num_tokens": 166928098.0, + "step": 4578 + }, + { + "epoch": 0.8503249767873723, + "grad_norm": 1.646226406097412, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8686168193817139, + "num_tokens": 166959144.0, + "step": 4579 + }, + { + "epoch": 0.8505106778087279, + "grad_norm": 1.5627617835998535, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8489587306976318, + "num_tokens": 166997284.0, + "step": 4580 + }, + { + "epoch": 0.8506963788300835, + "grad_norm": 1.5245413780212402, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8589221239089966, + "num_tokens": 167036109.0, + "step": 4581 + }, + { + "epoch": 0.8508820798514392, + "grad_norm": 1.4072339534759521, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8684495687484741, + "num_tokens": 167077540.0, + "step": 4582 + }, + { + "epoch": 0.8510677808727948, + "grad_norm": 1.5037262439727783, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8665685057640076, + "num_tokens": 167113301.0, + "step": 4583 + }, + { + "epoch": 0.8512534818941504, + "grad_norm": 1.5919352769851685, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8644840121269226, + "num_tokens": 167149418.0, + "step": 4584 + }, + { + "epoch": 0.851439182915506, + "grad_norm": 1.6985633373260498, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8801738023757935, + "num_tokens": 167178988.0, + "step": 4585 + }, + { + "epoch": 0.8516248839368616, + "grad_norm": 1.3988741636276245, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8841397762298584, + "num_tokens": 167220861.0, + "step": 4586 + }, + { + "epoch": 0.8518105849582173, + "grad_norm": 1.812739372253418, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8631751537322998, + "num_tokens": 167249344.0, + "step": 4587 + }, + { + "epoch": 0.8519962859795729, + "grad_norm": 1.5029876232147217, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.861315131187439, + "num_tokens": 167285171.0, + "step": 4588 + }, + { + "epoch": 0.8521819870009285, + "grad_norm": 1.5547672510147095, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8655065298080444, + "num_tokens": 167319411.0, + "step": 4589 + }, + { + "epoch": 0.8523676880222841, + "grad_norm": 1.503880262374878, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8783277273178101, + "num_tokens": 167351554.0, + "step": 4590 + }, + { + "epoch": 0.8525533890436398, + "grad_norm": 1.5710959434509277, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.853004515171051, + "num_tokens": 167392096.0, + "step": 4591 + }, + { + "epoch": 0.8527390900649954, + "grad_norm": 1.5462820529937744, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8760678172111511, + "num_tokens": 167424363.0, + "step": 4592 + }, + { + "epoch": 0.852924791086351, + "grad_norm": 1.518629550933838, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8449490666389465, + "num_tokens": 167468203.0, + "step": 4593 + }, + { + "epoch": 0.8531104921077066, + "grad_norm": 1.376489520072937, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8844159245491028, + "num_tokens": 167504292.0, + "step": 4594 + }, + { + "epoch": 0.8532961931290622, + "grad_norm": 1.6840335130691528, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8687881231307983, + "num_tokens": 167538150.0, + "step": 4595 + }, + { + "epoch": 0.8534818941504179, + "grad_norm": 1.4405666589736938, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8671614527702332, + "num_tokens": 167581516.0, + "step": 4596 + }, + { + "epoch": 0.8536675951717735, + "grad_norm": 1.6372188329696655, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8859312534332275, + "num_tokens": 167609112.0, + "step": 4597 + }, + { + "epoch": 0.8538532961931291, + "grad_norm": 1.5280500650405884, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8829644918441772, + "num_tokens": 167639964.0, + "step": 4598 + }, + { + "epoch": 0.8540389972144847, + "grad_norm": 1.5479097366333008, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8715725541114807, + "num_tokens": 167674254.0, + "step": 4599 + }, + { + "epoch": 0.8542246982358404, + "grad_norm": 1.5059409141540527, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.875754177570343, + "num_tokens": 167714014.0, + "step": 4600 + }, + { + "epoch": 0.8544103992571959, + "grad_norm": 1.5416094064712524, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8691321015357971, + "num_tokens": 167751212.0, + "step": 4601 + }, + { + "epoch": 0.8545961002785515, + "grad_norm": 1.5399144887924194, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8760546445846558, + "num_tokens": 167786321.0, + "step": 4602 + }, + { + "epoch": 0.8547818012999071, + "grad_norm": 1.5597171783447266, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.880058765411377, + "num_tokens": 167821185.0, + "step": 4603 + }, + { + "epoch": 0.8549675023212627, + "grad_norm": 1.4437047243118286, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8749229907989502, + "num_tokens": 167860992.0, + "step": 4604 + }, + { + "epoch": 0.8551532033426184, + "grad_norm": 1.4880354404449463, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8690677881240845, + "num_tokens": 167898548.0, + "step": 4605 + }, + { + "epoch": 0.855338904363974, + "grad_norm": 1.4337087869644165, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8708745241165161, + "num_tokens": 167940072.0, + "step": 4606 + }, + { + "epoch": 0.8555246053853296, + "grad_norm": 1.5574508905410767, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8800720572471619, + "num_tokens": 167970988.0, + "step": 4607 + }, + { + "epoch": 0.8557103064066852, + "grad_norm": 1.4351789951324463, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8648043870925903, + "num_tokens": 168008199.0, + "step": 4608 + }, + { + "epoch": 0.8558960074280408, + "grad_norm": 1.7549338340759277, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8725107908248901, + "num_tokens": 168035132.0, + "step": 4609 + }, + { + "epoch": 0.8560817084493965, + "grad_norm": 1.6652926206588745, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8684492707252502, + "num_tokens": 168067188.0, + "step": 4610 + }, + { + "epoch": 0.8562674094707521, + "grad_norm": 1.5287314653396606, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8657534122467041, + "num_tokens": 168105037.0, + "step": 4611 + }, + { + "epoch": 0.8564531104921077, + "grad_norm": 1.6678974628448486, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8677870035171509, + "num_tokens": 168136923.0, + "step": 4612 + }, + { + "epoch": 0.8566388115134633, + "grad_norm": 1.5657585859298706, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8701752424240112, + "num_tokens": 168172931.0, + "step": 4613 + }, + { + "epoch": 0.856824512534819, + "grad_norm": 1.4899688959121704, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8542127013206482, + "num_tokens": 168211660.0, + "step": 4614 + }, + { + "epoch": 0.8570102135561746, + "grad_norm": 1.448712706565857, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8566585779190063, + "num_tokens": 168253048.0, + "step": 4615 + }, + { + "epoch": 0.8571959145775302, + "grad_norm": 1.531308650970459, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8655391335487366, + "num_tokens": 168288462.0, + "step": 4616 + }, + { + "epoch": 0.8573816155988858, + "grad_norm": 2.3130812644958496, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8682385683059692, + "num_tokens": 168318165.0, + "step": 4617 + }, + { + "epoch": 0.8575673166202414, + "grad_norm": 1.547093391418457, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.856842041015625, + "num_tokens": 168354030.0, + "step": 4618 + }, + { + "epoch": 0.8577530176415971, + "grad_norm": 1.510529637336731, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8592318296432495, + "num_tokens": 168394300.0, + "step": 4619 + }, + { + "epoch": 0.8579387186629527, + "grad_norm": 1.4495078325271606, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8605313301086426, + "num_tokens": 168437180.0, + "step": 4620 + }, + { + "epoch": 0.8581244196843083, + "grad_norm": 1.5158843994140625, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8703732490539551, + "num_tokens": 168472882.0, + "step": 4621 + }, + { + "epoch": 0.8583101207056639, + "grad_norm": 1.505348563194275, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8745635747909546, + "num_tokens": 168510496.0, + "step": 4622 + }, + { + "epoch": 0.8584958217270195, + "grad_norm": 1.8174656629562378, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8777536153793335, + "num_tokens": 168535987.0, + "step": 4623 + }, + { + "epoch": 0.8586815227483752, + "grad_norm": 1.4459409713745117, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8814866542816162, + "num_tokens": 168571545.0, + "step": 4624 + }, + { + "epoch": 0.8588672237697307, + "grad_norm": 1.5827070474624634, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8645370006561279, + "num_tokens": 168606182.0, + "step": 4625 + }, + { + "epoch": 0.8590529247910863, + "grad_norm": 1.613558053970337, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8594557642936707, + "num_tokens": 168638833.0, + "step": 4626 + }, + { + "epoch": 0.8592386258124419, + "grad_norm": 1.5170990228652954, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.880609929561615, + "num_tokens": 168672070.0, + "step": 4627 + }, + { + "epoch": 0.8594243268337975, + "grad_norm": 1.5505980253219604, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8738296031951904, + "num_tokens": 168705772.0, + "step": 4628 + }, + { + "epoch": 0.8596100278551532, + "grad_norm": 1.5936895608901978, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8752022981643677, + "num_tokens": 168740642.0, + "step": 4629 + }, + { + "epoch": 0.8597957288765088, + "grad_norm": 1.7106153964996338, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8586126565933228, + "num_tokens": 168774313.0, + "step": 4630 + }, + { + "epoch": 0.8599814298978644, + "grad_norm": 1.512581467628479, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8809616565704346, + "num_tokens": 168811065.0, + "step": 4631 + }, + { + "epoch": 0.86016713091922, + "grad_norm": 1.6532390117645264, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8588896989822388, + "num_tokens": 168843912.0, + "step": 4632 + }, + { + "epoch": 0.8603528319405757, + "grad_norm": 1.399904727935791, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8714522123336792, + "num_tokens": 168880378.0, + "step": 4633 + }, + { + "epoch": 0.8605385329619313, + "grad_norm": 1.5130397081375122, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8713667392730713, + "num_tokens": 168915942.0, + "step": 4634 + }, + { + "epoch": 0.8607242339832869, + "grad_norm": 1.7778342962265015, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.861963152885437, + "num_tokens": 168947350.0, + "step": 4635 + }, + { + "epoch": 0.8609099350046425, + "grad_norm": 1.5039277076721191, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8752097487449646, + "num_tokens": 168982419.0, + "step": 4636 + }, + { + "epoch": 0.8610956360259981, + "grad_norm": 1.5710885524749756, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.865981936454773, + "num_tokens": 169016784.0, + "step": 4637 + }, + { + "epoch": 0.8612813370473538, + "grad_norm": 1.4675884246826172, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8672704696655273, + "num_tokens": 169055863.0, + "step": 4638 + }, + { + "epoch": 0.8614670380687094, + "grad_norm": 1.5293524265289307, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8564959168434143, + "num_tokens": 169092002.0, + "step": 4639 + }, + { + "epoch": 0.861652739090065, + "grad_norm": 1.5273324251174927, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8820197582244873, + "num_tokens": 169127636.0, + "step": 4640 + }, + { + "epoch": 0.8618384401114206, + "grad_norm": 1.5698291063308716, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8554199934005737, + "num_tokens": 169166513.0, + "step": 4641 + }, + { + "epoch": 0.8620241411327763, + "grad_norm": 1.4401519298553467, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8677647709846497, + "num_tokens": 169210237.0, + "step": 4642 + }, + { + "epoch": 0.8622098421541319, + "grad_norm": 1.5401690006256104, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8590166568756104, + "num_tokens": 169246254.0, + "step": 4643 + }, + { + "epoch": 0.8623955431754875, + "grad_norm": 1.462477445602417, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8734873533248901, + "num_tokens": 169279729.0, + "step": 4644 + }, + { + "epoch": 0.8625812441968431, + "grad_norm": 1.442830204963684, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8654826879501343, + "num_tokens": 169322077.0, + "step": 4645 + }, + { + "epoch": 0.8627669452181987, + "grad_norm": 1.4134466648101807, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8799436092376709, + "num_tokens": 169365487.0, + "step": 4646 + }, + { + "epoch": 0.8629526462395544, + "grad_norm": 1.5315591096878052, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8716350793838501, + "num_tokens": 169397819.0, + "step": 4647 + }, + { + "epoch": 0.86313834726091, + "grad_norm": 1.5428690910339355, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.861470639705658, + "num_tokens": 169432657.0, + "step": 4648 + }, + { + "epoch": 0.8633240482822655, + "grad_norm": 1.5128086805343628, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8726686239242554, + "num_tokens": 169468580.0, + "step": 4649 + }, + { + "epoch": 0.8635097493036211, + "grad_norm": 1.4674923419952393, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8649144172668457, + "num_tokens": 169509178.0, + "step": 4650 + }, + { + "epoch": 0.8636954503249767, + "grad_norm": 1.5755641460418701, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8729798793792725, + "num_tokens": 169544373.0, + "step": 4651 + }, + { + "epoch": 0.8638811513463324, + "grad_norm": 1.5014212131500244, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8780199289321899, + "num_tokens": 169581917.0, + "step": 4652 + }, + { + "epoch": 0.864066852367688, + "grad_norm": 1.496617078781128, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8642439842224121, + "num_tokens": 169618628.0, + "step": 4653 + }, + { + "epoch": 0.8642525533890436, + "grad_norm": 1.6758596897125244, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8652294874191284, + "num_tokens": 169650805.0, + "step": 4654 + }, + { + "epoch": 0.8644382544103992, + "grad_norm": 1.571197509765625, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8755605220794678, + "num_tokens": 169685856.0, + "step": 4655 + }, + { + "epoch": 0.8646239554317549, + "grad_norm": 1.483928918838501, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8611924052238464, + "num_tokens": 169725413.0, + "step": 4656 + }, + { + "epoch": 0.8648096564531105, + "grad_norm": 1.5450611114501953, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8751785159111023, + "num_tokens": 169760423.0, + "step": 4657 + }, + { + "epoch": 0.8649953574744661, + "grad_norm": 1.5186669826507568, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8715247511863708, + "num_tokens": 169798316.0, + "step": 4658 + }, + { + "epoch": 0.8651810584958217, + "grad_norm": 1.5741472244262695, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8582373261451721, + "num_tokens": 169835062.0, + "step": 4659 + }, + { + "epoch": 0.8653667595171773, + "grad_norm": 1.4790252447128296, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8672806620597839, + "num_tokens": 169873337.0, + "step": 4660 + }, + { + "epoch": 0.865552460538533, + "grad_norm": 1.3848135471343994, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8729155659675598, + "num_tokens": 169913263.0, + "step": 4661 + }, + { + "epoch": 0.8657381615598886, + "grad_norm": 1.4231126308441162, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8673828840255737, + "num_tokens": 169952526.0, + "step": 4662 + }, + { + "epoch": 0.8659238625812442, + "grad_norm": 1.6702758073806763, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8627780079841614, + "num_tokens": 169989018.0, + "step": 4663 + }, + { + "epoch": 0.8661095636025998, + "grad_norm": 1.5883700847625732, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8559358716011047, + "num_tokens": 170023665.0, + "step": 4664 + }, + { + "epoch": 0.8662952646239555, + "grad_norm": 1.5042279958724976, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.86826092004776, + "num_tokens": 170064501.0, + "step": 4665 + }, + { + "epoch": 0.8664809656453111, + "grad_norm": 1.3973942995071411, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8856373429298401, + "num_tokens": 170100697.0, + "step": 4666 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.3824024200439453, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8713801503181458, + "num_tokens": 170141595.0, + "step": 4667 + }, + { + "epoch": 0.8668523676880223, + "grad_norm": 1.3125479221343994, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.880823016166687, + "num_tokens": 170181158.0, + "step": 4668 + }, + { + "epoch": 0.8670380687093779, + "grad_norm": 1.4866195917129517, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8728560209274292, + "num_tokens": 170215354.0, + "step": 4669 + }, + { + "epoch": 0.8672237697307336, + "grad_norm": 1.4367927312850952, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8827714920043945, + "num_tokens": 170250479.0, + "step": 4670 + }, + { + "epoch": 0.8674094707520892, + "grad_norm": 1.4183112382888794, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8693511486053467, + "num_tokens": 170287035.0, + "step": 4671 + }, + { + "epoch": 0.8675951717734448, + "grad_norm": 1.7083196640014648, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8746742010116577, + "num_tokens": 170319246.0, + "step": 4672 + }, + { + "epoch": 0.8677808727948003, + "grad_norm": 1.373255968093872, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8659840822219849, + "num_tokens": 170359846.0, + "step": 4673 + }, + { + "epoch": 0.8679665738161559, + "grad_norm": 1.5464709997177124, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8555712699890137, + "num_tokens": 170393796.0, + "step": 4674 + }, + { + "epoch": 0.8681522748375116, + "grad_norm": 1.4551215171813965, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8614854216575623, + "num_tokens": 170430782.0, + "step": 4675 + }, + { + "epoch": 0.8683379758588672, + "grad_norm": 1.5796504020690918, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.859893262386322, + "num_tokens": 170463600.0, + "step": 4676 + }, + { + "epoch": 0.8685236768802228, + "grad_norm": 1.5327650308609009, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8686741590499878, + "num_tokens": 170499385.0, + "step": 4677 + }, + { + "epoch": 0.8687093779015784, + "grad_norm": 1.573984980583191, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8705070614814758, + "num_tokens": 170532171.0, + "step": 4678 + }, + { + "epoch": 0.868895078922934, + "grad_norm": 1.6661144495010376, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8689664602279663, + "num_tokens": 170562235.0, + "step": 4679 + }, + { + "epoch": 0.8690807799442897, + "grad_norm": 1.5307848453521729, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8528867959976196, + "num_tokens": 170604060.0, + "step": 4680 + }, + { + "epoch": 0.8692664809656453, + "grad_norm": 1.5886479616165161, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8500882983207703, + "num_tokens": 170640751.0, + "step": 4681 + }, + { + "epoch": 0.8694521819870009, + "grad_norm": 1.6094093322753906, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.858597457408905, + "num_tokens": 170677140.0, + "step": 4682 + }, + { + "epoch": 0.8696378830083565, + "grad_norm": 1.7330108880996704, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8718058466911316, + "num_tokens": 170705104.0, + "step": 4683 + }, + { + "epoch": 0.8698235840297122, + "grad_norm": 1.398701548576355, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8776187896728516, + "num_tokens": 170747608.0, + "step": 4684 + }, + { + "epoch": 0.8700092850510678, + "grad_norm": 1.4242781400680542, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.861780047416687, + "num_tokens": 170787855.0, + "step": 4685 + }, + { + "epoch": 0.8701949860724234, + "grad_norm": 1.4599180221557617, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8706879019737244, + "num_tokens": 170828237.0, + "step": 4686 + }, + { + "epoch": 0.870380687093779, + "grad_norm": 1.4939237833023071, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8669052124023438, + "num_tokens": 170867828.0, + "step": 4687 + }, + { + "epoch": 0.8705663881151346, + "grad_norm": 1.533551812171936, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8672705292701721, + "num_tokens": 170903690.0, + "step": 4688 + }, + { + "epoch": 0.8707520891364903, + "grad_norm": 1.5047247409820557, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8781604766845703, + "num_tokens": 170940264.0, + "step": 4689 + }, + { + "epoch": 0.8709377901578459, + "grad_norm": 1.401591420173645, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8774580359458923, + "num_tokens": 170978285.0, + "step": 4690 + }, + { + "epoch": 0.8711234911792015, + "grad_norm": 1.7133934497833252, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8699389100074768, + "num_tokens": 171007097.0, + "step": 4691 + }, + { + "epoch": 0.8713091922005571, + "grad_norm": 1.4945448637008667, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8674018383026123, + "num_tokens": 171043286.0, + "step": 4692 + }, + { + "epoch": 0.8714948932219128, + "grad_norm": 1.6461101770401, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8585113286972046, + "num_tokens": 171076629.0, + "step": 4693 + }, + { + "epoch": 0.8716805942432684, + "grad_norm": 1.6855814456939697, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8589849472045898, + "num_tokens": 171108266.0, + "step": 4694 + }, + { + "epoch": 0.871866295264624, + "grad_norm": 1.386760950088501, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8748806715011597, + "num_tokens": 171150353.0, + "step": 4695 + }, + { + "epoch": 0.8720519962859796, + "grad_norm": 1.4840582609176636, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8621474504470825, + "num_tokens": 171189764.0, + "step": 4696 + }, + { + "epoch": 0.8722376973073351, + "grad_norm": 1.4826338291168213, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.883013904094696, + "num_tokens": 171228831.0, + "step": 4697 + }, + { + "epoch": 0.8724233983286908, + "grad_norm": 1.4938617944717407, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8707179427146912, + "num_tokens": 171264806.0, + "step": 4698 + }, + { + "epoch": 0.8726090993500464, + "grad_norm": 1.5183169841766357, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8630329966545105, + "num_tokens": 171305656.0, + "step": 4699 + }, + { + "epoch": 0.872794800371402, + "grad_norm": 1.7230478525161743, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8598129153251648, + "num_tokens": 171336728.0, + "step": 4700 + }, + { + "epoch": 0.8729805013927576, + "grad_norm": 1.4348711967468262, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8725377917289734, + "num_tokens": 171374719.0, + "step": 4701 + }, + { + "epoch": 0.8731662024141132, + "grad_norm": 1.4819645881652832, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8657326698303223, + "num_tokens": 171411920.0, + "step": 4702 + }, + { + "epoch": 0.8733519034354689, + "grad_norm": 1.517159104347229, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8635586500167847, + "num_tokens": 171447051.0, + "step": 4703 + }, + { + "epoch": 0.8735376044568245, + "grad_norm": 1.573051929473877, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8600850105285645, + "num_tokens": 171479229.0, + "step": 4704 + }, + { + "epoch": 0.8737233054781801, + "grad_norm": 1.712907314300537, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8483738899230957, + "num_tokens": 171515135.0, + "step": 4705 + }, + { + "epoch": 0.8739090064995357, + "grad_norm": 1.5325547456741333, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8634847402572632, + "num_tokens": 171552975.0, + "step": 4706 + }, + { + "epoch": 0.8740947075208914, + "grad_norm": 1.5237622261047363, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8835854530334473, + "num_tokens": 171587854.0, + "step": 4707 + }, + { + "epoch": 0.874280408542247, + "grad_norm": 1.3989390134811401, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.870110034942627, + "num_tokens": 171630364.0, + "step": 4708 + }, + { + "epoch": 0.8744661095636026, + "grad_norm": 1.3879553079605103, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8665658831596375, + "num_tokens": 171672309.0, + "step": 4709 + }, + { + "epoch": 0.8746518105849582, + "grad_norm": 1.3232115507125854, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8859837055206299, + "num_tokens": 171712104.0, + "step": 4710 + }, + { + "epoch": 0.8748375116063138, + "grad_norm": 1.40411376953125, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8662800788879395, + "num_tokens": 171754010.0, + "step": 4711 + }, + { + "epoch": 0.8750232126276695, + "grad_norm": 1.458554983139038, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8588190078735352, + "num_tokens": 171794634.0, + "step": 4712 + }, + { + "epoch": 0.8752089136490251, + "grad_norm": 1.4380770921707153, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8644305467605591, + "num_tokens": 171834715.0, + "step": 4713 + }, + { + "epoch": 0.8753946146703807, + "grad_norm": 1.4819080829620361, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8742372393608093, + "num_tokens": 171869680.0, + "step": 4714 + }, + { + "epoch": 0.8755803156917363, + "grad_norm": 1.5562236309051514, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8773919343948364, + "num_tokens": 171906727.0, + "step": 4715 + }, + { + "epoch": 0.875766016713092, + "grad_norm": 1.4784799814224243, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8754821419715881, + "num_tokens": 171942192.0, + "step": 4716 + }, + { + "epoch": 0.8759517177344476, + "grad_norm": 1.3962608575820923, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8606811761856079, + "num_tokens": 171989275.0, + "step": 4717 + }, + { + "epoch": 0.8761374187558032, + "grad_norm": 1.4566367864608765, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8810434341430664, + "num_tokens": 172026195.0, + "step": 4718 + }, + { + "epoch": 0.8763231197771588, + "grad_norm": 1.4816482067108154, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8564372062683105, + "num_tokens": 172062343.0, + "step": 4719 + }, + { + "epoch": 0.8765088207985144, + "grad_norm": 1.578900694847107, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8766374588012695, + "num_tokens": 172096800.0, + "step": 4720 + }, + { + "epoch": 0.87669452181987, + "grad_norm": 1.7306568622589111, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8672099709510803, + "num_tokens": 172124257.0, + "step": 4721 + }, + { + "epoch": 0.8768802228412256, + "grad_norm": 1.5773769617080688, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8694763779640198, + "num_tokens": 172157030.0, + "step": 4722 + }, + { + "epoch": 0.8770659238625812, + "grad_norm": 1.5026533603668213, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8659821152687073, + "num_tokens": 172192457.0, + "step": 4723 + }, + { + "epoch": 0.8772516248839368, + "grad_norm": 1.3477894067764282, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8939396142959595, + "num_tokens": 172230981.0, + "step": 4724 + }, + { + "epoch": 0.8774373259052924, + "grad_norm": 1.3885799646377563, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8722763061523438, + "num_tokens": 172270625.0, + "step": 4725 + }, + { + "epoch": 0.8776230269266481, + "grad_norm": 1.3975657224655151, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8778231143951416, + "num_tokens": 172307494.0, + "step": 4726 + }, + { + "epoch": 0.8778087279480037, + "grad_norm": 1.4679949283599854, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8798247575759888, + "num_tokens": 172341310.0, + "step": 4727 + }, + { + "epoch": 0.8779944289693593, + "grad_norm": 1.5672845840454102, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8764689564704895, + "num_tokens": 172377085.0, + "step": 4728 + }, + { + "epoch": 0.8781801299907149, + "grad_norm": 1.4486807584762573, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8799347877502441, + "num_tokens": 172411260.0, + "step": 4729 + }, + { + "epoch": 0.8783658310120706, + "grad_norm": 1.565251350402832, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8729692697525024, + "num_tokens": 172446926.0, + "step": 4730 + }, + { + "epoch": 0.8785515320334262, + "grad_norm": 1.4256033897399902, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8798828125, + "num_tokens": 172489651.0, + "step": 4731 + }, + { + "epoch": 0.8787372330547818, + "grad_norm": 1.5539824962615967, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8676778078079224, + "num_tokens": 172529886.0, + "step": 4732 + }, + { + "epoch": 0.8789229340761374, + "grad_norm": 1.4085739850997925, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8701926469802856, + "num_tokens": 172575648.0, + "step": 4733 + }, + { + "epoch": 0.879108635097493, + "grad_norm": 1.556817650794983, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.859948456287384, + "num_tokens": 172613563.0, + "step": 4734 + }, + { + "epoch": 0.8792943361188487, + "grad_norm": 1.5891273021697998, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8824359178543091, + "num_tokens": 172647125.0, + "step": 4735 + }, + { + "epoch": 0.8794800371402043, + "grad_norm": 1.5743212699890137, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8559761047363281, + "num_tokens": 172681469.0, + "step": 4736 + }, + { + "epoch": 0.8796657381615599, + "grad_norm": 1.4975814819335938, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8591499328613281, + "num_tokens": 172717864.0, + "step": 4737 + }, + { + "epoch": 0.8798514391829155, + "grad_norm": 1.5213475227355957, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8657033443450928, + "num_tokens": 172755387.0, + "step": 4738 + }, + { + "epoch": 0.8800371402042712, + "grad_norm": 1.592025876045227, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8723049759864807, + "num_tokens": 172786819.0, + "step": 4739 + }, + { + "epoch": 0.8802228412256268, + "grad_norm": 1.4743361473083496, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8799914121627808, + "num_tokens": 172826631.0, + "step": 4740 + }, + { + "epoch": 0.8804085422469824, + "grad_norm": 1.49327552318573, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8691948652267456, + "num_tokens": 172862766.0, + "step": 4741 + }, + { + "epoch": 0.880594243268338, + "grad_norm": 1.3680270910263062, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.873794674873352, + "num_tokens": 172905060.0, + "step": 4742 + }, + { + "epoch": 0.8807799442896936, + "grad_norm": 1.6133594512939453, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8672734498977661, + "num_tokens": 172939775.0, + "step": 4743 + }, + { + "epoch": 0.8809656453110493, + "grad_norm": 1.591853380203247, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8485051989555359, + "num_tokens": 172982821.0, + "step": 4744 + }, + { + "epoch": 0.8811513463324049, + "grad_norm": 1.6433351039886475, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8648650050163269, + "num_tokens": 173012270.0, + "step": 4745 + }, + { + "epoch": 0.8813370473537604, + "grad_norm": 1.5079426765441895, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8713281154632568, + "num_tokens": 173047346.0, + "step": 4746 + }, + { + "epoch": 0.881522748375116, + "grad_norm": 1.6254816055297852, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.869652271270752, + "num_tokens": 173078996.0, + "step": 4747 + }, + { + "epoch": 0.8817084493964716, + "grad_norm": 1.6793075799942017, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8839142322540283, + "num_tokens": 173108857.0, + "step": 4748 + }, + { + "epoch": 0.8818941504178273, + "grad_norm": 1.4306113719940186, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8802083730697632, + "num_tokens": 173144200.0, + "step": 4749 + }, + { + "epoch": 0.8820798514391829, + "grad_norm": 1.6000008583068848, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8635052442550659, + "num_tokens": 173177523.0, + "step": 4750 + }, + { + "epoch": 0.8822655524605385, + "grad_norm": 1.3769516944885254, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8805987238883972, + "num_tokens": 173219990.0, + "step": 4751 + }, + { + "epoch": 0.8824512534818941, + "grad_norm": 1.5898317098617554, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8660291433334351, + "num_tokens": 173256962.0, + "step": 4752 + }, + { + "epoch": 0.8826369545032497, + "grad_norm": 1.4905627965927124, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.866807222366333, + "num_tokens": 173296110.0, + "step": 4753 + }, + { + "epoch": 0.8828226555246054, + "grad_norm": 1.5782335996627808, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8527398109436035, + "num_tokens": 173331868.0, + "step": 4754 + }, + { + "epoch": 0.883008356545961, + "grad_norm": 1.5874372720718384, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8783023357391357, + "num_tokens": 173371487.0, + "step": 4755 + }, + { + "epoch": 0.8831940575673166, + "grad_norm": 1.4339429140090942, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8504313826560974, + "num_tokens": 173417080.0, + "step": 4756 + }, + { + "epoch": 0.8833797585886722, + "grad_norm": 1.5190860033035278, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8641281127929688, + "num_tokens": 173451992.0, + "step": 4757 + }, + { + "epoch": 0.8835654596100279, + "grad_norm": 1.5281329154968262, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8760406970977783, + "num_tokens": 173490048.0, + "step": 4758 + }, + { + "epoch": 0.8837511606313835, + "grad_norm": 1.5381414890289307, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8707340955734253, + "num_tokens": 173525155.0, + "step": 4759 + }, + { + "epoch": 0.8839368616527391, + "grad_norm": 1.5149837732315063, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8628530502319336, + "num_tokens": 173564690.0, + "step": 4760 + }, + { + "epoch": 0.8841225626740947, + "grad_norm": 1.3811296224594116, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8809975385665894, + "num_tokens": 173604246.0, + "step": 4761 + }, + { + "epoch": 0.8843082636954503, + "grad_norm": 1.5777345895767212, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8748582601547241, + "num_tokens": 173638178.0, + "step": 4762 + }, + { + "epoch": 0.884493964716806, + "grad_norm": 1.458432674407959, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8743156790733337, + "num_tokens": 173677137.0, + "step": 4763 + }, + { + "epoch": 0.8846796657381616, + "grad_norm": 1.5294910669326782, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8783432841300964, + "num_tokens": 173712480.0, + "step": 4764 + }, + { + "epoch": 0.8848653667595172, + "grad_norm": 1.6358133554458618, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8712069392204285, + "num_tokens": 173744998.0, + "step": 4765 + }, + { + "epoch": 0.8850510677808728, + "grad_norm": 1.4685479402542114, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8617575168609619, + "num_tokens": 173785009.0, + "step": 4766 + }, + { + "epoch": 0.8852367688022285, + "grad_norm": 1.4675904512405396, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8544636964797974, + "num_tokens": 173822987.0, + "step": 4767 + }, + { + "epoch": 0.8854224698235841, + "grad_norm": 1.6066417694091797, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8752797842025757, + "num_tokens": 173854081.0, + "step": 4768 + }, + { + "epoch": 0.8856081708449397, + "grad_norm": 1.500015139579773, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8705971837043762, + "num_tokens": 173892947.0, + "step": 4769 + }, + { + "epoch": 0.8857938718662952, + "grad_norm": 1.545113205909729, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8808876276016235, + "num_tokens": 173925423.0, + "step": 4770 + }, + { + "epoch": 0.8859795728876508, + "grad_norm": 1.4811888933181763, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8868309259414673, + "num_tokens": 173958773.0, + "step": 4771 + }, + { + "epoch": 0.8861652739090065, + "grad_norm": 1.6322734355926514, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8635712265968323, + "num_tokens": 173995072.0, + "step": 4772 + }, + { + "epoch": 0.8863509749303621, + "grad_norm": 1.3862321376800537, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8729954957962036, + "num_tokens": 174037843.0, + "step": 4773 + }, + { + "epoch": 0.8865366759517177, + "grad_norm": 1.6136388778686523, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8669060468673706, + "num_tokens": 174076324.0, + "step": 4774 + }, + { + "epoch": 0.8867223769730733, + "grad_norm": 1.3243048191070557, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8919103145599365, + "num_tokens": 174117822.0, + "step": 4775 + }, + { + "epoch": 0.886908077994429, + "grad_norm": 1.7385878562927246, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8460500240325928, + "num_tokens": 174151358.0, + "step": 4776 + }, + { + "epoch": 0.8870937790157846, + "grad_norm": 1.4254615306854248, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8797520399093628, + "num_tokens": 174188350.0, + "step": 4777 + }, + { + "epoch": 0.8872794800371402, + "grad_norm": 1.6085233688354492, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.870908796787262, + "num_tokens": 174224444.0, + "step": 4778 + }, + { + "epoch": 0.8874651810584958, + "grad_norm": 1.5298967361450195, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8679109811782837, + "num_tokens": 174259842.0, + "step": 4779 + }, + { + "epoch": 0.8876508820798514, + "grad_norm": 1.4865880012512207, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8686400055885315, + "num_tokens": 174299344.0, + "step": 4780 + }, + { + "epoch": 0.887836583101207, + "grad_norm": 1.5548306703567505, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8653576374053955, + "num_tokens": 174335823.0, + "step": 4781 + }, + { + "epoch": 0.8880222841225627, + "grad_norm": 1.6234617233276367, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8738052845001221, + "num_tokens": 174368678.0, + "step": 4782 + }, + { + "epoch": 0.8882079851439183, + "grad_norm": 1.6277137994766235, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8771665096282959, + "num_tokens": 174400383.0, + "step": 4783 + }, + { + "epoch": 0.8883936861652739, + "grad_norm": 1.5154774188995361, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8758110404014587, + "num_tokens": 174433322.0, + "step": 4784 + }, + { + "epoch": 0.8885793871866295, + "grad_norm": 1.8026305437088013, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8686245083808899, + "num_tokens": 174461239.0, + "step": 4785 + }, + { + "epoch": 0.8887650882079852, + "grad_norm": 1.4944957494735718, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8604229092597961, + "num_tokens": 174497065.0, + "step": 4786 + }, + { + "epoch": 0.8889507892293408, + "grad_norm": 1.5525662899017334, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8769094944000244, + "num_tokens": 174529676.0, + "step": 4787 + }, + { + "epoch": 0.8891364902506964, + "grad_norm": 1.3200030326843262, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8644375801086426, + "num_tokens": 174578258.0, + "step": 4788 + }, + { + "epoch": 0.889322191272052, + "grad_norm": 1.5202505588531494, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.886444091796875, + "num_tokens": 174616653.0, + "step": 4789 + }, + { + "epoch": 0.8895078922934077, + "grad_norm": 1.408022403717041, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8691788911819458, + "num_tokens": 174657704.0, + "step": 4790 + }, + { + "epoch": 0.8896935933147633, + "grad_norm": 1.443514108657837, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8753986954689026, + "num_tokens": 174695158.0, + "step": 4791 + }, + { + "epoch": 0.8898792943361189, + "grad_norm": 1.4879006147384644, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8807724118232727, + "num_tokens": 174732521.0, + "step": 4792 + }, + { + "epoch": 0.8900649953574745, + "grad_norm": 1.651968240737915, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8580915927886963, + "num_tokens": 174764361.0, + "step": 4793 + }, + { + "epoch": 0.89025069637883, + "grad_norm": 1.4708138704299927, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8665516376495361, + "num_tokens": 174803949.0, + "step": 4794 + }, + { + "epoch": 0.8904363974001857, + "grad_norm": 1.5088175535202026, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8834323883056641, + "num_tokens": 174837474.0, + "step": 4795 + }, + { + "epoch": 0.8906220984215413, + "grad_norm": 1.4842257499694824, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8624548316001892, + "num_tokens": 174875221.0, + "step": 4796 + }, + { + "epoch": 0.8908077994428969, + "grad_norm": 1.4911201000213623, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8772233724594116, + "num_tokens": 174908069.0, + "step": 4797 + }, + { + "epoch": 0.8909935004642525, + "grad_norm": 1.4973647594451904, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8640596866607666, + "num_tokens": 174947009.0, + "step": 4798 + }, + { + "epoch": 0.8911792014856081, + "grad_norm": 1.4751495122909546, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8629128932952881, + "num_tokens": 174982210.0, + "step": 4799 + }, + { + "epoch": 0.8913649025069638, + "grad_norm": 1.7706347703933716, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8601312041282654, + "num_tokens": 175015625.0, + "step": 4800 + }, + { + "epoch": 0.8915506035283194, + "grad_norm": 1.4760522842407227, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8763472437858582, + "num_tokens": 175055238.0, + "step": 4801 + }, + { + "epoch": 0.891736304549675, + "grad_norm": 1.3713289499282837, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8785959482192993, + "num_tokens": 175098320.0, + "step": 4802 + }, + { + "epoch": 0.8919220055710306, + "grad_norm": 1.593049168586731, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8737995624542236, + "num_tokens": 175132999.0, + "step": 4803 + }, + { + "epoch": 0.8921077065923863, + "grad_norm": 1.4506183862686157, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8805062174797058, + "num_tokens": 175172370.0, + "step": 4804 + }, + { + "epoch": 0.8922934076137419, + "grad_norm": 1.674276351928711, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8557941913604736, + "num_tokens": 175206021.0, + "step": 4805 + }, + { + "epoch": 0.8924791086350975, + "grad_norm": 1.379746913909912, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8775980472564697, + "num_tokens": 175248015.0, + "step": 4806 + }, + { + "epoch": 0.8926648096564531, + "grad_norm": 1.5328634977340698, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8710126876831055, + "num_tokens": 175282056.0, + "step": 4807 + }, + { + "epoch": 0.8928505106778087, + "grad_norm": 1.4515841007232666, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8611841201782227, + "num_tokens": 175317390.0, + "step": 4808 + }, + { + "epoch": 0.8930362116991644, + "grad_norm": 1.7256273031234741, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8601701855659485, + "num_tokens": 175348735.0, + "step": 4809 + }, + { + "epoch": 0.89322191272052, + "grad_norm": 1.3768936395645142, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8673385977745056, + "num_tokens": 175391829.0, + "step": 4810 + }, + { + "epoch": 0.8934076137418756, + "grad_norm": 1.5298045873641968, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8700343370437622, + "num_tokens": 175424044.0, + "step": 4811 + }, + { + "epoch": 0.8935933147632312, + "grad_norm": 1.4424234628677368, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8669499158859253, + "num_tokens": 175466848.0, + "step": 4812 + }, + { + "epoch": 0.8937790157845868, + "grad_norm": 1.3982256650924683, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8615391254425049, + "num_tokens": 175511433.0, + "step": 4813 + }, + { + "epoch": 0.8939647168059425, + "grad_norm": 1.4331828355789185, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8744673132896423, + "num_tokens": 175550404.0, + "step": 4814 + }, + { + "epoch": 0.8941504178272981, + "grad_norm": 1.541285753250122, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8794925212860107, + "num_tokens": 175584503.0, + "step": 4815 + }, + { + "epoch": 0.8943361188486537, + "grad_norm": 1.3801947832107544, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8764438629150391, + "num_tokens": 175624285.0, + "step": 4816 + }, + { + "epoch": 0.8945218198700093, + "grad_norm": 1.4417616128921509, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8742138743400574, + "num_tokens": 175659850.0, + "step": 4817 + }, + { + "epoch": 0.8947075208913648, + "grad_norm": 1.5252212285995483, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8524034023284912, + "num_tokens": 175695754.0, + "step": 4818 + }, + { + "epoch": 0.8948932219127205, + "grad_norm": 1.5184026956558228, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8735013008117676, + "num_tokens": 175736755.0, + "step": 4819 + }, + { + "epoch": 0.8950789229340761, + "grad_norm": 1.5826938152313232, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8647482395172119, + "num_tokens": 175776568.0, + "step": 4820 + }, + { + "epoch": 0.8952646239554317, + "grad_norm": 1.517617106437683, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8777640461921692, + "num_tokens": 175809741.0, + "step": 4821 + }, + { + "epoch": 0.8954503249767873, + "grad_norm": 1.5583610534667969, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8753920793533325, + "num_tokens": 175842632.0, + "step": 4822 + }, + { + "epoch": 0.895636025998143, + "grad_norm": 1.579404354095459, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8617385029792786, + "num_tokens": 175879265.0, + "step": 4823 + }, + { + "epoch": 0.8958217270194986, + "grad_norm": 1.639934778213501, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8607471585273743, + "num_tokens": 175912320.0, + "step": 4824 + }, + { + "epoch": 0.8960074280408542, + "grad_norm": 1.5283830165863037, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8708881735801697, + "num_tokens": 175947699.0, + "step": 4825 + }, + { + "epoch": 0.8961931290622098, + "grad_norm": 1.5984262228012085, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8607003688812256, + "num_tokens": 175982414.0, + "step": 4826 + }, + { + "epoch": 0.8963788300835654, + "grad_norm": 1.4714208841323853, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8737660646438599, + "num_tokens": 176021175.0, + "step": 4827 + }, + { + "epoch": 0.8965645311049211, + "grad_norm": 1.538533329963684, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8624366521835327, + "num_tokens": 176059087.0, + "step": 4828 + }, + { + "epoch": 0.8967502321262767, + "grad_norm": 1.530639886856079, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.870949387550354, + "num_tokens": 176095129.0, + "step": 4829 + }, + { + "epoch": 0.8969359331476323, + "grad_norm": 1.5769990682601929, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8752663135528564, + "num_tokens": 176125777.0, + "step": 4830 + }, + { + "epoch": 0.8971216341689879, + "grad_norm": 1.7162754535675049, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8615942597389221, + "num_tokens": 176156347.0, + "step": 4831 + }, + { + "epoch": 0.8973073351903436, + "grad_norm": 1.5522363185882568, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8677868247032166, + "num_tokens": 176194942.0, + "step": 4832 + }, + { + "epoch": 0.8974930362116992, + "grad_norm": 1.4498323202133179, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.877029299736023, + "num_tokens": 176236229.0, + "step": 4833 + }, + { + "epoch": 0.8976787372330548, + "grad_norm": 1.5008474588394165, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8647205829620361, + "num_tokens": 176274052.0, + "step": 4834 + }, + { + "epoch": 0.8978644382544104, + "grad_norm": 1.3302990198135376, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8796135187149048, + "num_tokens": 176316042.0, + "step": 4835 + }, + { + "epoch": 0.898050139275766, + "grad_norm": 1.4717892408370972, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8693954944610596, + "num_tokens": 176354832.0, + "step": 4836 + }, + { + "epoch": 0.8982358402971217, + "grad_norm": 1.5983099937438965, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8717153072357178, + "num_tokens": 176388810.0, + "step": 4837 + }, + { + "epoch": 0.8984215413184773, + "grad_norm": 1.5340723991394043, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8736028671264648, + "num_tokens": 176426293.0, + "step": 4838 + }, + { + "epoch": 0.8986072423398329, + "grad_norm": 1.5787922143936157, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8539091348648071, + "num_tokens": 176461918.0, + "step": 4839 + }, + { + "epoch": 0.8987929433611885, + "grad_norm": 1.4188742637634277, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8673453330993652, + "num_tokens": 176504111.0, + "step": 4840 + }, + { + "epoch": 0.8989786443825442, + "grad_norm": 1.724025011062622, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.863922119140625, + "num_tokens": 176538353.0, + "step": 4841 + }, + { + "epoch": 0.8991643454038997, + "grad_norm": 1.579163670539856, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8813534379005432, + "num_tokens": 176568488.0, + "step": 4842 + }, + { + "epoch": 0.8993500464252553, + "grad_norm": 1.4006391763687134, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.876522421836853, + "num_tokens": 176608369.0, + "step": 4843 + }, + { + "epoch": 0.8995357474466109, + "grad_norm": 1.6609256267547607, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8706561923027039, + "num_tokens": 176639574.0, + "step": 4844 + }, + { + "epoch": 0.8997214484679665, + "grad_norm": 1.5028482675552368, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8643336296081543, + "num_tokens": 176674605.0, + "step": 4845 + }, + { + "epoch": 0.8999071494893222, + "grad_norm": 1.554011344909668, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8509497046470642, + "num_tokens": 176710748.0, + "step": 4846 + }, + { + "epoch": 0.9000928505106778, + "grad_norm": 1.3985698223114014, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8757951259613037, + "num_tokens": 176751973.0, + "step": 4847 + }, + { + "epoch": 0.9002785515320334, + "grad_norm": 1.6799694299697876, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8671684861183167, + "num_tokens": 176782771.0, + "step": 4848 + }, + { + "epoch": 0.900464252553389, + "grad_norm": 1.4992340803146362, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.87361741065979, + "num_tokens": 176817584.0, + "step": 4849 + }, + { + "epoch": 0.9006499535747446, + "grad_norm": 1.4682918787002563, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8781226873397827, + "num_tokens": 176855904.0, + "step": 4850 + }, + { + "epoch": 0.9008356545961003, + "grad_norm": 1.7117559909820557, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.853238046169281, + "num_tokens": 176885719.0, + "step": 4851 + }, + { + "epoch": 0.9010213556174559, + "grad_norm": 1.5860869884490967, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8496460914611816, + "num_tokens": 176924898.0, + "step": 4852 + }, + { + "epoch": 0.9012070566388115, + "grad_norm": 1.40346097946167, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8721129894256592, + "num_tokens": 176966307.0, + "step": 4853 + }, + { + "epoch": 0.9013927576601671, + "grad_norm": 1.6208287477493286, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8572896718978882, + "num_tokens": 176999724.0, + "step": 4854 + }, + { + "epoch": 0.9015784586815228, + "grad_norm": 1.5076602697372437, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8677628040313721, + "num_tokens": 177034926.0, + "step": 4855 + }, + { + "epoch": 0.9017641597028784, + "grad_norm": 1.4744848012924194, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8723377585411072, + "num_tokens": 177074122.0, + "step": 4856 + }, + { + "epoch": 0.901949860724234, + "grad_norm": 1.5814235210418701, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8547263145446777, + "num_tokens": 177108448.0, + "step": 4857 + }, + { + "epoch": 0.9021355617455896, + "grad_norm": 1.3943877220153809, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8584386110305786, + "num_tokens": 177149800.0, + "step": 4858 + }, + { + "epoch": 0.9023212627669452, + "grad_norm": 1.6038357019424438, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.867489218711853, + "num_tokens": 177183642.0, + "step": 4859 + }, + { + "epoch": 0.9025069637883009, + "grad_norm": 1.4313291311264038, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8762636780738831, + "num_tokens": 177221871.0, + "step": 4860 + }, + { + "epoch": 0.9026926648096565, + "grad_norm": 1.536144495010376, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.876983106136322, + "num_tokens": 177256564.0, + "step": 4861 + }, + { + "epoch": 0.9028783658310121, + "grad_norm": 1.481418490409851, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8692121505737305, + "num_tokens": 177293758.0, + "step": 4862 + }, + { + "epoch": 0.9030640668523677, + "grad_norm": 1.4286432266235352, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8694701790809631, + "num_tokens": 177335706.0, + "step": 4863 + }, + { + "epoch": 0.9032497678737234, + "grad_norm": 1.554950475692749, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8583371639251709, + "num_tokens": 177372777.0, + "step": 4864 + }, + { + "epoch": 0.903435468895079, + "grad_norm": 1.359164834022522, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.881635844707489, + "num_tokens": 177411507.0, + "step": 4865 + }, + { + "epoch": 0.9036211699164345, + "grad_norm": 1.5088132619857788, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8635287284851074, + "num_tokens": 177448522.0, + "step": 4866 + }, + { + "epoch": 0.9038068709377901, + "grad_norm": 1.581478476524353, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8533130288124084, + "num_tokens": 177483084.0, + "step": 4867 + }, + { + "epoch": 0.9039925719591457, + "grad_norm": 1.432236909866333, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8798525929450989, + "num_tokens": 177522103.0, + "step": 4868 + }, + { + "epoch": 0.9041782729805014, + "grad_norm": 1.5382437705993652, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8805240988731384, + "num_tokens": 177556410.0, + "step": 4869 + }, + { + "epoch": 0.904363974001857, + "grad_norm": 1.6986415386199951, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8670618534088135, + "num_tokens": 177593418.0, + "step": 4870 + }, + { + "epoch": 0.9045496750232126, + "grad_norm": 1.5055731534957886, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8642240762710571, + "num_tokens": 177633106.0, + "step": 4871 + }, + { + "epoch": 0.9047353760445682, + "grad_norm": 1.5346180200576782, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8654614686965942, + "num_tokens": 177668055.0, + "step": 4872 + }, + { + "epoch": 0.9049210770659238, + "grad_norm": 1.5783189535140991, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.866639256477356, + "num_tokens": 177701534.0, + "step": 4873 + }, + { + "epoch": 0.9051067780872795, + "grad_norm": 1.5914838314056396, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8646079301834106, + "num_tokens": 177735310.0, + "step": 4874 + }, + { + "epoch": 0.9052924791086351, + "grad_norm": 1.3852455615997314, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8653541207313538, + "num_tokens": 177778465.0, + "step": 4875 + }, + { + "epoch": 0.9054781801299907, + "grad_norm": 1.7763937711715698, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8598902225494385, + "num_tokens": 177811507.0, + "step": 4876 + }, + { + "epoch": 0.9056638811513463, + "grad_norm": 1.5820106267929077, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8378258943557739, + "num_tokens": 177850256.0, + "step": 4877 + }, + { + "epoch": 0.905849582172702, + "grad_norm": 1.5655906200408936, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8726187944412231, + "num_tokens": 177883247.0, + "step": 4878 + }, + { + "epoch": 0.9060352831940576, + "grad_norm": 1.4984331130981445, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8750532865524292, + "num_tokens": 177915824.0, + "step": 4879 + }, + { + "epoch": 0.9062209842154132, + "grad_norm": 1.412563681602478, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.860548734664917, + "num_tokens": 177957918.0, + "step": 4880 + }, + { + "epoch": 0.9064066852367688, + "grad_norm": 1.5550636053085327, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8915408849716187, + "num_tokens": 177990052.0, + "step": 4881 + }, + { + "epoch": 0.9065923862581244, + "grad_norm": 1.4762588739395142, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8852425813674927, + "num_tokens": 178026671.0, + "step": 4882 + }, + { + "epoch": 0.9067780872794801, + "grad_norm": 1.4599124193191528, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8668450117111206, + "num_tokens": 178066155.0, + "step": 4883 + }, + { + "epoch": 0.9069637883008357, + "grad_norm": 1.6296485662460327, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8662936687469482, + "num_tokens": 178098248.0, + "step": 4884 + }, + { + "epoch": 0.9071494893221913, + "grad_norm": 1.3818066120147705, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8758168816566467, + "num_tokens": 178140742.0, + "step": 4885 + }, + { + "epoch": 0.9073351903435469, + "grad_norm": 1.6140189170837402, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8686307668685913, + "num_tokens": 178177151.0, + "step": 4886 + }, + { + "epoch": 0.9075208913649025, + "grad_norm": 1.5450433492660522, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8618767261505127, + "num_tokens": 178219427.0, + "step": 4887 + }, + { + "epoch": 0.9077065923862582, + "grad_norm": 1.5990384817123413, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8744580149650574, + "num_tokens": 178253305.0, + "step": 4888 + }, + { + "epoch": 0.9078922934076138, + "grad_norm": 1.4863340854644775, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8690986633300781, + "num_tokens": 178291748.0, + "step": 4889 + }, + { + "epoch": 0.9080779944289693, + "grad_norm": 1.6281179189682007, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8771511316299438, + "num_tokens": 178323787.0, + "step": 4890 + }, + { + "epoch": 0.9082636954503249, + "grad_norm": 1.458060622215271, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8687626719474792, + "num_tokens": 178360410.0, + "step": 4891 + }, + { + "epoch": 0.9084493964716805, + "grad_norm": 1.4679876565933228, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8747929930686951, + "num_tokens": 178396415.0, + "step": 4892 + }, + { + "epoch": 0.9086350974930362, + "grad_norm": 1.4155123233795166, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.872077226638794, + "num_tokens": 178437524.0, + "step": 4893 + }, + { + "epoch": 0.9088207985143918, + "grad_norm": 1.5825475454330444, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8621575236320496, + "num_tokens": 178472912.0, + "step": 4894 + }, + { + "epoch": 0.9090064995357474, + "grad_norm": 1.402733564376831, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8493918776512146, + "num_tokens": 178519155.0, + "step": 4895 + }, + { + "epoch": 0.909192200557103, + "grad_norm": 1.6360282897949219, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8669881820678711, + "num_tokens": 178550735.0, + "step": 4896 + }, + { + "epoch": 0.9093779015784587, + "grad_norm": 1.3904603719711304, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8847911357879639, + "num_tokens": 178588884.0, + "step": 4897 + }, + { + "epoch": 0.9095636025998143, + "grad_norm": 1.3975392580032349, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8795739412307739, + "num_tokens": 178629359.0, + "step": 4898 + }, + { + "epoch": 0.9097493036211699, + "grad_norm": 1.6933811902999878, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8504886031150818, + "num_tokens": 178660788.0, + "step": 4899 + }, + { + "epoch": 0.9099350046425255, + "grad_norm": 1.4468050003051758, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8785355091094971, + "num_tokens": 178696507.0, + "step": 4900 + }, + { + "epoch": 0.9101207056638811, + "grad_norm": 1.4005018472671509, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8740590214729309, + "num_tokens": 178734459.0, + "step": 4901 + }, + { + "epoch": 0.9103064066852368, + "grad_norm": 1.559719443321228, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8717941641807556, + "num_tokens": 178769536.0, + "step": 4902 + }, + { + "epoch": 0.9104921077065924, + "grad_norm": 1.608018398284912, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8551832437515259, + "num_tokens": 178805914.0, + "step": 4903 + }, + { + "epoch": 0.910677808727948, + "grad_norm": 1.5473527908325195, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8640466928482056, + "num_tokens": 178841051.0, + "step": 4904 + }, + { + "epoch": 0.9108635097493036, + "grad_norm": 1.4258087873458862, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8898317813873291, + "num_tokens": 178877339.0, + "step": 4905 + }, + { + "epoch": 0.9110492107706593, + "grad_norm": 1.4509040117263794, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8596097230911255, + "num_tokens": 178919602.0, + "step": 4906 + }, + { + "epoch": 0.9112349117920149, + "grad_norm": 1.4565502405166626, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.872384250164032, + "num_tokens": 178957800.0, + "step": 4907 + }, + { + "epoch": 0.9114206128133705, + "grad_norm": 1.4277825355529785, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8838810920715332, + "num_tokens": 178993314.0, + "step": 4908 + }, + { + "epoch": 0.9116063138347261, + "grad_norm": 1.7102805376052856, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8586053848266602, + "num_tokens": 179023175.0, + "step": 4909 + }, + { + "epoch": 0.9117920148560817, + "grad_norm": 1.539237380027771, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8521353006362915, + "num_tokens": 179061170.0, + "step": 4910 + }, + { + "epoch": 0.9119777158774374, + "grad_norm": 1.4400091171264648, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8590624928474426, + "num_tokens": 179102693.0, + "step": 4911 + }, + { + "epoch": 0.912163416898793, + "grad_norm": 1.4876583814620972, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8656754493713379, + "num_tokens": 179143328.0, + "step": 4912 + }, + { + "epoch": 0.9123491179201486, + "grad_norm": 1.5189622640609741, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8420310020446777, + "num_tokens": 179182823.0, + "step": 4913 + }, + { + "epoch": 0.9125348189415042, + "grad_norm": 1.5804524421691895, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8664284944534302, + "num_tokens": 179214811.0, + "step": 4914 + }, + { + "epoch": 0.9127205199628597, + "grad_norm": 1.5300244092941284, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8702281713485718, + "num_tokens": 179250928.0, + "step": 4915 + }, + { + "epoch": 0.9129062209842154, + "grad_norm": 1.4755769968032837, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8803035616874695, + "num_tokens": 179285156.0, + "step": 4916 + }, + { + "epoch": 0.913091922005571, + "grad_norm": 1.4302093982696533, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8661864995956421, + "num_tokens": 179324152.0, + "step": 4917 + }, + { + "epoch": 0.9132776230269266, + "grad_norm": 1.4996857643127441, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8655185103416443, + "num_tokens": 179359969.0, + "step": 4918 + }, + { + "epoch": 0.9134633240482822, + "grad_norm": 1.619799017906189, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8683581948280334, + "num_tokens": 179394026.0, + "step": 4919 + }, + { + "epoch": 0.9136490250696379, + "grad_norm": 1.4856672286987305, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.877554178237915, + "num_tokens": 179426344.0, + "step": 4920 + }, + { + "epoch": 0.9138347260909935, + "grad_norm": 1.5282678604125977, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8701508045196533, + "num_tokens": 179460073.0, + "step": 4921 + }, + { + "epoch": 0.9140204271123491, + "grad_norm": 1.3898826837539673, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8853692412376404, + "num_tokens": 179499639.0, + "step": 4922 + }, + { + "epoch": 0.9142061281337047, + "grad_norm": 1.59010910987854, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8791880011558533, + "num_tokens": 179530397.0, + "step": 4923 + }, + { + "epoch": 0.9143918291550603, + "grad_norm": 1.4473222494125366, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8669903874397278, + "num_tokens": 179571023.0, + "step": 4924 + }, + { + "epoch": 0.914577530176416, + "grad_norm": 1.4197468757629395, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8635565042495728, + "num_tokens": 179610729.0, + "step": 4925 + }, + { + "epoch": 0.9147632311977716, + "grad_norm": 1.6790530681610107, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.870419979095459, + "num_tokens": 179642469.0, + "step": 4926 + }, + { + "epoch": 0.9149489322191272, + "grad_norm": 1.4577542543411255, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8804590106010437, + "num_tokens": 179680917.0, + "step": 4927 + }, + { + "epoch": 0.9151346332404828, + "grad_norm": 1.4768199920654297, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8678197860717773, + "num_tokens": 179719548.0, + "step": 4928 + }, + { + "epoch": 0.9153203342618385, + "grad_norm": 1.5367822647094727, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.884121298789978, + "num_tokens": 179753940.0, + "step": 4929 + }, + { + "epoch": 0.9155060352831941, + "grad_norm": 1.559828281402588, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8560280799865723, + "num_tokens": 179791118.0, + "step": 4930 + }, + { + "epoch": 0.9156917363045497, + "grad_norm": 1.512104868888855, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8710364699363708, + "num_tokens": 179828686.0, + "step": 4931 + }, + { + "epoch": 0.9158774373259053, + "grad_norm": 1.4096406698226929, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8780494928359985, + "num_tokens": 179869965.0, + "step": 4932 + }, + { + "epoch": 0.9160631383472609, + "grad_norm": 1.6443023681640625, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8789955377578735, + "num_tokens": 179899718.0, + "step": 4933 + }, + { + "epoch": 0.9162488393686166, + "grad_norm": 1.4729139804840088, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8648101687431335, + "num_tokens": 179937945.0, + "step": 4934 + }, + { + "epoch": 0.9164345403899722, + "grad_norm": 1.5503971576690674, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8681146502494812, + "num_tokens": 179971502.0, + "step": 4935 + }, + { + "epoch": 0.9166202414113278, + "grad_norm": 1.45066499710083, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8666458129882812, + "num_tokens": 180009894.0, + "step": 4936 + }, + { + "epoch": 0.9168059424326834, + "grad_norm": 1.49847412109375, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8814886212348938, + "num_tokens": 180042671.0, + "step": 4937 + }, + { + "epoch": 0.916991643454039, + "grad_norm": 1.5254794359207153, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8623909950256348, + "num_tokens": 180079042.0, + "step": 4938 + }, + { + "epoch": 0.9171773444753946, + "grad_norm": 1.4272125959396362, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8801857233047485, + "num_tokens": 180116521.0, + "step": 4939 + }, + { + "epoch": 0.9173630454967502, + "grad_norm": 1.5024996995925903, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8689754009246826, + "num_tokens": 180151417.0, + "step": 4940 + }, + { + "epoch": 0.9175487465181058, + "grad_norm": 1.5576660633087158, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8788912892341614, + "num_tokens": 180183419.0, + "step": 4941 + }, + { + "epoch": 0.9177344475394614, + "grad_norm": 1.528767466545105, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.863616943359375, + "num_tokens": 180223209.0, + "step": 4942 + }, + { + "epoch": 0.917920148560817, + "grad_norm": 1.3936045169830322, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8766278624534607, + "num_tokens": 180260951.0, + "step": 4943 + }, + { + "epoch": 0.9181058495821727, + "grad_norm": 1.5256344079971313, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8574792742729187, + "num_tokens": 180299374.0, + "step": 4944 + }, + { + "epoch": 0.9182915506035283, + "grad_norm": 1.5034233331680298, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8690664172172546, + "num_tokens": 180332186.0, + "step": 4945 + }, + { + "epoch": 0.9184772516248839, + "grad_norm": 1.5824428796768188, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8545546531677246, + "num_tokens": 180369320.0, + "step": 4946 + }, + { + "epoch": 0.9186629526462395, + "grad_norm": 1.4715946912765503, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8667441606521606, + "num_tokens": 180405510.0, + "step": 4947 + }, + { + "epoch": 0.9188486536675952, + "grad_norm": 1.4332823753356934, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8634482622146606, + "num_tokens": 180443973.0, + "step": 4948 + }, + { + "epoch": 0.9190343546889508, + "grad_norm": 1.5377339124679565, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8580104112625122, + "num_tokens": 180482568.0, + "step": 4949 + }, + { + "epoch": 0.9192200557103064, + "grad_norm": 1.6481987237930298, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8738813996315002, + "num_tokens": 180512792.0, + "step": 4950 + }, + { + "epoch": 0.919405756731662, + "grad_norm": 1.4788063764572144, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8691879510879517, + "num_tokens": 180551533.0, + "step": 4951 + }, + { + "epoch": 0.9195914577530176, + "grad_norm": 1.5714253187179565, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8732495307922363, + "num_tokens": 180586792.0, + "step": 4952 + }, + { + "epoch": 0.9197771587743733, + "grad_norm": 1.549198031425476, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8612474203109741, + "num_tokens": 180618944.0, + "step": 4953 + }, + { + "epoch": 0.9199628597957289, + "grad_norm": 1.3580763339996338, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8835310339927673, + "num_tokens": 180660266.0, + "step": 4954 + }, + { + "epoch": 0.9201485608170845, + "grad_norm": 1.4479811191558838, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8800013065338135, + "num_tokens": 180695755.0, + "step": 4955 + }, + { + "epoch": 0.9203342618384401, + "grad_norm": 1.6010870933532715, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8334994912147522, + "num_tokens": 180735583.0, + "step": 4956 + }, + { + "epoch": 0.9205199628597958, + "grad_norm": 1.6605682373046875, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8489639759063721, + "num_tokens": 180769773.0, + "step": 4957 + }, + { + "epoch": 0.9207056638811514, + "grad_norm": 1.555166482925415, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8795621395111084, + "num_tokens": 180805715.0, + "step": 4958 + }, + { + "epoch": 0.920891364902507, + "grad_norm": 1.7382948398590088, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8526263236999512, + "num_tokens": 180836398.0, + "step": 4959 + }, + { + "epoch": 0.9210770659238626, + "grad_norm": 1.8323215246200562, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8449361324310303, + "num_tokens": 180867830.0, + "step": 4960 + }, + { + "epoch": 0.9212627669452182, + "grad_norm": 1.647473931312561, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8670637607574463, + "num_tokens": 180899352.0, + "step": 4961 + }, + { + "epoch": 0.9214484679665739, + "grad_norm": 1.4899559020996094, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8623570799827576, + "num_tokens": 180935290.0, + "step": 4962 + }, + { + "epoch": 0.9216341689879294, + "grad_norm": 1.6257816553115845, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8478410840034485, + "num_tokens": 180972648.0, + "step": 4963 + }, + { + "epoch": 0.921819870009285, + "grad_norm": 1.549869179725647, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8785412311553955, + "num_tokens": 181004006.0, + "step": 4964 + }, + { + "epoch": 0.9220055710306406, + "grad_norm": 1.4070876836776733, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8787603974342346, + "num_tokens": 181042933.0, + "step": 4965 + }, + { + "epoch": 0.9221912720519962, + "grad_norm": 1.4920626878738403, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8803690671920776, + "num_tokens": 181077376.0, + "step": 4966 + }, + { + "epoch": 0.9223769730733519, + "grad_norm": 1.4928958415985107, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8657654523849487, + "num_tokens": 181116475.0, + "step": 4967 + }, + { + "epoch": 0.9225626740947075, + "grad_norm": 1.5581198930740356, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8642913103103638, + "num_tokens": 181152358.0, + "step": 4968 + }, + { + "epoch": 0.9227483751160631, + "grad_norm": 1.5503208637237549, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8693907260894775, + "num_tokens": 181185653.0, + "step": 4969 + }, + { + "epoch": 0.9229340761374187, + "grad_norm": 1.4206691980361938, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8687236309051514, + "num_tokens": 181227116.0, + "step": 4970 + }, + { + "epoch": 0.9231197771587744, + "grad_norm": 1.458932638168335, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8771119117736816, + "num_tokens": 181264176.0, + "step": 4971 + }, + { + "epoch": 0.92330547818013, + "grad_norm": 1.630283236503601, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8757332563400269, + "num_tokens": 181294575.0, + "step": 4972 + }, + { + "epoch": 0.9234911792014856, + "grad_norm": 1.5032414197921753, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8603846430778503, + "num_tokens": 181332824.0, + "step": 4973 + }, + { + "epoch": 0.9236768802228412, + "grad_norm": 1.5808231830596924, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8634512424468994, + "num_tokens": 181364989.0, + "step": 4974 + }, + { + "epoch": 0.9238625812441968, + "grad_norm": 1.3931430578231812, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8858211040496826, + "num_tokens": 181402812.0, + "step": 4975 + }, + { + "epoch": 0.9240482822655525, + "grad_norm": 1.5517849922180176, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8626536726951599, + "num_tokens": 181439913.0, + "step": 4976 + }, + { + "epoch": 0.9242339832869081, + "grad_norm": 1.4746778011322021, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8583942651748657, + "num_tokens": 181478593.0, + "step": 4977 + }, + { + "epoch": 0.9244196843082637, + "grad_norm": 1.5761481523513794, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8777187466621399, + "num_tokens": 181511125.0, + "step": 4978 + }, + { + "epoch": 0.9246053853296193, + "grad_norm": 1.8284717798233032, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.868282675743103, + "num_tokens": 181538818.0, + "step": 4979 + }, + { + "epoch": 0.924791086350975, + "grad_norm": 1.6156095266342163, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8593003749847412, + "num_tokens": 181575814.0, + "step": 4980 + }, + { + "epoch": 0.9249767873723306, + "grad_norm": 1.4583982229232788, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8756512999534607, + "num_tokens": 181611490.0, + "step": 4981 + }, + { + "epoch": 0.9251624883936862, + "grad_norm": 1.5358439683914185, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.880085825920105, + "num_tokens": 181643065.0, + "step": 4982 + }, + { + "epoch": 0.9253481894150418, + "grad_norm": 1.556506872177124, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.874761164188385, + "num_tokens": 181676571.0, + "step": 4983 + }, + { + "epoch": 0.9255338904363974, + "grad_norm": 1.5595991611480713, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8780208826065063, + "num_tokens": 181707746.0, + "step": 4984 + }, + { + "epoch": 0.9257195914577531, + "grad_norm": 1.4416407346725464, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8698711395263672, + "num_tokens": 181747628.0, + "step": 4985 + }, + { + "epoch": 0.9259052924791087, + "grad_norm": 1.3963608741760254, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8753262758255005, + "num_tokens": 181788757.0, + "step": 4986 + }, + { + "epoch": 0.9260909935004642, + "grad_norm": 1.7048238515853882, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8696576356887817, + "num_tokens": 181819652.0, + "step": 4987 + }, + { + "epoch": 0.9262766945218198, + "grad_norm": 1.5323131084442139, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8676033616065979, + "num_tokens": 181854632.0, + "step": 4988 + }, + { + "epoch": 0.9264623955431754, + "grad_norm": 1.3310599327087402, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8707795143127441, + "num_tokens": 181895949.0, + "step": 4989 + }, + { + "epoch": 0.9266480965645311, + "grad_norm": 1.587875247001648, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8683819770812988, + "num_tokens": 181929046.0, + "step": 4990 + }, + { + "epoch": 0.9268337975858867, + "grad_norm": 1.4843862056732178, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8643976449966431, + "num_tokens": 181967150.0, + "step": 4991 + }, + { + "epoch": 0.9270194986072423, + "grad_norm": 1.5819703340530396, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8658369779586792, + "num_tokens": 182000264.0, + "step": 4992 + }, + { + "epoch": 0.9272051996285979, + "grad_norm": 1.5271960496902466, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8629077672958374, + "num_tokens": 182037441.0, + "step": 4993 + }, + { + "epoch": 0.9273909006499536, + "grad_norm": 1.490652322769165, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8616006374359131, + "num_tokens": 182073022.0, + "step": 4994 + }, + { + "epoch": 0.9275766016713092, + "grad_norm": 1.5195122957229614, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8625023365020752, + "num_tokens": 182109308.0, + "step": 4995 + }, + { + "epoch": 0.9277623026926648, + "grad_norm": 1.413578987121582, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8737627863883972, + "num_tokens": 182151551.0, + "step": 4996 + }, + { + "epoch": 0.9279480037140204, + "grad_norm": 1.4957199096679688, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8624321222305298, + "num_tokens": 182187064.0, + "step": 4997 + }, + { + "epoch": 0.928133704735376, + "grad_norm": 1.3951218128204346, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8759379386901855, + "num_tokens": 182226340.0, + "step": 4998 + }, + { + "epoch": 0.9283194057567317, + "grad_norm": 1.5942736864089966, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8679074048995972, + "num_tokens": 182260173.0, + "step": 4999 + }, + { + "epoch": 0.9285051067780873, + "grad_norm": 1.4776010513305664, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8626139163970947, + "num_tokens": 182299889.0, + "step": 5000 + }, + { + "epoch": 0.9286908077994429, + "grad_norm": 1.698835015296936, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8736059665679932, + "num_tokens": 182340106.0, + "step": 5001 + }, + { + "epoch": 0.9288765088207985, + "grad_norm": 1.593163013458252, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8715503811836243, + "num_tokens": 182374336.0, + "step": 5002 + }, + { + "epoch": 0.9290622098421542, + "grad_norm": 1.5702800750732422, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8557417392730713, + "num_tokens": 182409467.0, + "step": 5003 + }, + { + "epoch": 0.9292479108635098, + "grad_norm": 1.8814680576324463, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8739612102508545, + "num_tokens": 182446051.0, + "step": 5004 + }, + { + "epoch": 0.9294336118848654, + "grad_norm": 1.6289459466934204, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8673680424690247, + "num_tokens": 182478765.0, + "step": 5005 + }, + { + "epoch": 0.929619312906221, + "grad_norm": 1.6522022485733032, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8390357494354248, + "num_tokens": 182513840.0, + "step": 5006 + }, + { + "epoch": 0.9298050139275766, + "grad_norm": 1.6229153871536255, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8803023099899292, + "num_tokens": 182542867.0, + "step": 5007 + }, + { + "epoch": 0.9299907149489323, + "grad_norm": 1.3833563327789307, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8721893429756165, + "num_tokens": 182585133.0, + "step": 5008 + }, + { + "epoch": 0.9301764159702879, + "grad_norm": 1.5089350938796997, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8673723936080933, + "num_tokens": 182621730.0, + "step": 5009 + }, + { + "epoch": 0.9303621169916435, + "grad_norm": 1.61940598487854, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8640064001083374, + "num_tokens": 182656043.0, + "step": 5010 + }, + { + "epoch": 0.930547818012999, + "grad_norm": 1.4795687198638916, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8655162453651428, + "num_tokens": 182695312.0, + "step": 5011 + }, + { + "epoch": 0.9307335190343546, + "grad_norm": 1.4733166694641113, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8692431449890137, + "num_tokens": 182732225.0, + "step": 5012 + }, + { + "epoch": 0.9309192200557103, + "grad_norm": 1.4389517307281494, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8868242502212524, + "num_tokens": 182765475.0, + "step": 5013 + }, + { + "epoch": 0.9311049210770659, + "grad_norm": 1.3885517120361328, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8682158589363098, + "num_tokens": 182806588.0, + "step": 5014 + }, + { + "epoch": 0.9312906220984215, + "grad_norm": 1.4838147163391113, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8706685304641724, + "num_tokens": 182842361.0, + "step": 5015 + }, + { + "epoch": 0.9314763231197771, + "grad_norm": 1.423453450202942, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8745112419128418, + "num_tokens": 182881281.0, + "step": 5016 + }, + { + "epoch": 0.9316620241411327, + "grad_norm": 1.4898408651351929, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.863936722278595, + "num_tokens": 182917294.0, + "step": 5017 + }, + { + "epoch": 0.9318477251624884, + "grad_norm": 1.388559103012085, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.875414252281189, + "num_tokens": 182959227.0, + "step": 5018 + }, + { + "epoch": 0.932033426183844, + "grad_norm": 1.6084460020065308, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8586152195930481, + "num_tokens": 182995269.0, + "step": 5019 + }, + { + "epoch": 0.9322191272051996, + "grad_norm": 1.3039144277572632, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8868895769119263, + "num_tokens": 183035907.0, + "step": 5020 + }, + { + "epoch": 0.9324048282265552, + "grad_norm": 1.4915369749069214, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.847182035446167, + "num_tokens": 183075254.0, + "step": 5021 + }, + { + "epoch": 0.9325905292479109, + "grad_norm": 1.4152050018310547, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8806861639022827, + "num_tokens": 183111986.0, + "step": 5022 + }, + { + "epoch": 0.9327762302692665, + "grad_norm": 1.6644103527069092, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8691291809082031, + "num_tokens": 183146730.0, + "step": 5023 + }, + { + "epoch": 0.9329619312906221, + "grad_norm": 1.4175339937210083, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8727459907531738, + "num_tokens": 183191584.0, + "step": 5024 + }, + { + "epoch": 0.9331476323119777, + "grad_norm": 1.4710140228271484, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8734217286109924, + "num_tokens": 183227697.0, + "step": 5025 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3738387823104858, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8731651306152344, + "num_tokens": 183267667.0, + "step": 5026 + }, + { + "epoch": 0.933519034354689, + "grad_norm": 1.4343663454055786, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8664247989654541, + "num_tokens": 183309280.0, + "step": 5027 + }, + { + "epoch": 0.9337047353760446, + "grad_norm": 1.381964087486267, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8682081699371338, + "num_tokens": 183353531.0, + "step": 5028 + }, + { + "epoch": 0.9338904363974002, + "grad_norm": 1.5314193964004517, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8673374652862549, + "num_tokens": 183389893.0, + "step": 5029 + }, + { + "epoch": 0.9340761374187558, + "grad_norm": 1.6230603456497192, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8824626207351685, + "num_tokens": 183420700.0, + "step": 5030 + }, + { + "epoch": 0.9342618384401115, + "grad_norm": 1.4446016550064087, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8710747957229614, + "num_tokens": 183459791.0, + "step": 5031 + }, + { + "epoch": 0.9344475394614671, + "grad_norm": 1.5258264541625977, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8654316067695618, + "num_tokens": 183497735.0, + "step": 5032 + }, + { + "epoch": 0.9346332404828227, + "grad_norm": 1.487640142440796, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8671396970748901, + "num_tokens": 183534014.0, + "step": 5033 + }, + { + "epoch": 0.9348189415041783, + "grad_norm": 1.4882794618606567, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8753801584243774, + "num_tokens": 183569293.0, + "step": 5034 + }, + { + "epoch": 0.9350046425255338, + "grad_norm": 1.5181134939193726, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8695568442344666, + "num_tokens": 183605078.0, + "step": 5035 + }, + { + "epoch": 0.9351903435468895, + "grad_norm": 1.439254879951477, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8841797709465027, + "num_tokens": 183638056.0, + "step": 5036 + }, + { + "epoch": 0.9353760445682451, + "grad_norm": 1.589615821838379, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8674138784408569, + "num_tokens": 183672180.0, + "step": 5037 + }, + { + "epoch": 0.9355617455896007, + "grad_norm": 1.475696086883545, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8651262521743774, + "num_tokens": 183708664.0, + "step": 5038 + }, + { + "epoch": 0.9357474466109563, + "grad_norm": 1.472530722618103, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8783506155014038, + "num_tokens": 183744612.0, + "step": 5039 + }, + { + "epoch": 0.935933147632312, + "grad_norm": 1.5068176984786987, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8567975759506226, + "num_tokens": 183783324.0, + "step": 5040 + }, + { + "epoch": 0.9361188486536676, + "grad_norm": 1.4164276123046875, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8684004545211792, + "num_tokens": 183824412.0, + "step": 5041 + }, + { + "epoch": 0.9363045496750232, + "grad_norm": 1.4596673250198364, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8609611988067627, + "num_tokens": 183861859.0, + "step": 5042 + }, + { + "epoch": 0.9364902506963788, + "grad_norm": 1.3882853984832764, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8856099843978882, + "num_tokens": 183899347.0, + "step": 5043 + }, + { + "epoch": 0.9366759517177344, + "grad_norm": 1.5819203853607178, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8818495273590088, + "num_tokens": 183930061.0, + "step": 5044 + }, + { + "epoch": 0.9368616527390901, + "grad_norm": 1.5713094472885132, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8659991025924683, + "num_tokens": 183962323.0, + "step": 5045 + }, + { + "epoch": 0.9370473537604457, + "grad_norm": 1.4869121313095093, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8762117624282837, + "num_tokens": 183999401.0, + "step": 5046 + }, + { + "epoch": 0.9372330547818013, + "grad_norm": 1.5511736869812012, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8829044103622437, + "num_tokens": 184028324.0, + "step": 5047 + }, + { + "epoch": 0.9374187558031569, + "grad_norm": 1.5520119667053223, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8602646589279175, + "num_tokens": 184064335.0, + "step": 5048 + }, + { + "epoch": 0.9376044568245125, + "grad_norm": 1.797420859336853, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8670335412025452, + "num_tokens": 184095216.0, + "step": 5049 + }, + { + "epoch": 0.9377901578458682, + "grad_norm": 1.7972824573516846, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8409165143966675, + "num_tokens": 184127638.0, + "step": 5050 + }, + { + "epoch": 0.9379758588672238, + "grad_norm": 1.4593162536621094, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8868818879127502, + "num_tokens": 184162843.0, + "step": 5051 + }, + { + "epoch": 0.9381615598885794, + "grad_norm": 1.5221742391586304, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8547808527946472, + "num_tokens": 184203467.0, + "step": 5052 + }, + { + "epoch": 0.938347260909935, + "grad_norm": 1.4427094459533691, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8674789667129517, + "num_tokens": 184244404.0, + "step": 5053 + }, + { + "epoch": 0.9385329619312907, + "grad_norm": 1.7162773609161377, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8624836206436157, + "num_tokens": 184275555.0, + "step": 5054 + }, + { + "epoch": 0.9387186629526463, + "grad_norm": 1.6608753204345703, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8545268774032593, + "num_tokens": 184312372.0, + "step": 5055 + }, + { + "epoch": 0.9389043639740019, + "grad_norm": 1.4591492414474487, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8689666986465454, + "num_tokens": 184350426.0, + "step": 5056 + }, + { + "epoch": 0.9390900649953575, + "grad_norm": 1.5408964157104492, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8584247827529907, + "num_tokens": 184384433.0, + "step": 5057 + }, + { + "epoch": 0.9392757660167131, + "grad_norm": 1.3837006092071533, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8782179355621338, + "num_tokens": 184426377.0, + "step": 5058 + }, + { + "epoch": 0.9394614670380688, + "grad_norm": 1.4949613809585571, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8705312013626099, + "num_tokens": 184460312.0, + "step": 5059 + }, + { + "epoch": 0.9396471680594243, + "grad_norm": 1.5866506099700928, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8653671741485596, + "num_tokens": 184490414.0, + "step": 5060 + }, + { + "epoch": 0.9398328690807799, + "grad_norm": 1.660073161125183, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.873071551322937, + "num_tokens": 184519405.0, + "step": 5061 + }, + { + "epoch": 0.9400185701021355, + "grad_norm": 1.6120672225952148, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.859367847442627, + "num_tokens": 184554972.0, + "step": 5062 + }, + { + "epoch": 0.9402042711234911, + "grad_norm": 1.494094729423523, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8714521527290344, + "num_tokens": 184592485.0, + "step": 5063 + }, + { + "epoch": 0.9403899721448468, + "grad_norm": 1.30975341796875, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8899401426315308, + "num_tokens": 184630701.0, + "step": 5064 + }, + { + "epoch": 0.9405756731662024, + "grad_norm": 1.4215114116668701, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8766423463821411, + "num_tokens": 184666131.0, + "step": 5065 + }, + { + "epoch": 0.940761374187558, + "grad_norm": 1.4477006196975708, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8692421913146973, + "num_tokens": 184708076.0, + "step": 5066 + }, + { + "epoch": 0.9409470752089136, + "grad_norm": 1.5653982162475586, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8755450248718262, + "num_tokens": 184740896.0, + "step": 5067 + }, + { + "epoch": 0.9411327762302693, + "grad_norm": 1.5605906248092651, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8643282055854797, + "num_tokens": 184780731.0, + "step": 5068 + }, + { + "epoch": 0.9413184772516249, + "grad_norm": 1.5928360223770142, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8624651432037354, + "num_tokens": 184818734.0, + "step": 5069 + }, + { + "epoch": 0.9415041782729805, + "grad_norm": 1.5664265155792236, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8722189664840698, + "num_tokens": 184852475.0, + "step": 5070 + }, + { + "epoch": 0.9416898792943361, + "grad_norm": 1.5368975400924683, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8572214245796204, + "num_tokens": 184892181.0, + "step": 5071 + }, + { + "epoch": 0.9418755803156917, + "grad_norm": 1.5636272430419922, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8888176679611206, + "num_tokens": 184927091.0, + "step": 5072 + }, + { + "epoch": 0.9420612813370474, + "grad_norm": 1.4834312200546265, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8775181174278259, + "num_tokens": 184961438.0, + "step": 5073 + }, + { + "epoch": 0.942246982358403, + "grad_norm": 1.5820034742355347, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8733668327331543, + "num_tokens": 184995298.0, + "step": 5074 + }, + { + "epoch": 0.9424326833797586, + "grad_norm": 1.5244770050048828, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.883123517036438, + "num_tokens": 185032681.0, + "step": 5075 + }, + { + "epoch": 0.9426183844011142, + "grad_norm": 2.039858102798462, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8679413795471191, + "num_tokens": 185067443.0, + "step": 5076 + }, + { + "epoch": 0.9428040854224699, + "grad_norm": 1.4087713956832886, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8602861166000366, + "num_tokens": 185109775.0, + "step": 5077 + }, + { + "epoch": 0.9429897864438255, + "grad_norm": 1.4848016500473022, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8732384443283081, + "num_tokens": 185144912.0, + "step": 5078 + }, + { + "epoch": 0.9431754874651811, + "grad_norm": 1.6038932800292969, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8667483925819397, + "num_tokens": 185175681.0, + "step": 5079 + }, + { + "epoch": 0.9433611884865367, + "grad_norm": 1.6367701292037964, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8676077127456665, + "num_tokens": 185208971.0, + "step": 5080 + }, + { + "epoch": 0.9435468895078923, + "grad_norm": 1.5135985612869263, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8479456901550293, + "num_tokens": 185246153.0, + "step": 5081 + }, + { + "epoch": 0.943732590529248, + "grad_norm": 1.614659070968628, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8651293516159058, + "num_tokens": 185283620.0, + "step": 5082 + }, + { + "epoch": 0.9439182915506036, + "grad_norm": 1.5212180614471436, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8817758560180664, + "num_tokens": 185319274.0, + "step": 5083 + }, + { + "epoch": 0.9441039925719591, + "grad_norm": 1.4433950185775757, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8667709827423096, + "num_tokens": 185358105.0, + "step": 5084 + }, + { + "epoch": 0.9442896935933147, + "grad_norm": 1.5538681745529175, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8544806838035583, + "num_tokens": 185396517.0, + "step": 5085 + }, + { + "epoch": 0.9444753946146703, + "grad_norm": 1.6830438375473022, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8575011491775513, + "num_tokens": 185426484.0, + "step": 5086 + }, + { + "epoch": 0.944661095636026, + "grad_norm": 1.5402967929840088, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8707124590873718, + "num_tokens": 185460037.0, + "step": 5087 + }, + { + "epoch": 0.9448467966573816, + "grad_norm": 1.4387720823287964, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8748800754547119, + "num_tokens": 185495812.0, + "step": 5088 + }, + { + "epoch": 0.9450324976787372, + "grad_norm": 1.5268630981445312, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.873808741569519, + "num_tokens": 185529972.0, + "step": 5089 + }, + { + "epoch": 0.9452181987000928, + "grad_norm": 1.444383144378662, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8837918043136597, + "num_tokens": 185566486.0, + "step": 5090 + }, + { + "epoch": 0.9454038997214484, + "grad_norm": 1.6568892002105713, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.866378903388977, + "num_tokens": 185594282.0, + "step": 5091 + }, + { + "epoch": 0.9455896007428041, + "grad_norm": 1.3918776512145996, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8703656792640686, + "num_tokens": 185636372.0, + "step": 5092 + }, + { + "epoch": 0.9457753017641597, + "grad_norm": 1.5045695304870605, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8729528784751892, + "num_tokens": 185672991.0, + "step": 5093 + }, + { + "epoch": 0.9459610027855153, + "grad_norm": 1.5060428380966187, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8641971349716187, + "num_tokens": 185712562.0, + "step": 5094 + }, + { + "epoch": 0.9461467038068709, + "grad_norm": 1.5543049573898315, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8695525527000427, + "num_tokens": 185747523.0, + "step": 5095 + }, + { + "epoch": 0.9463324048282266, + "grad_norm": 1.4681313037872314, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8768745064735413, + "num_tokens": 185785100.0, + "step": 5096 + }, + { + "epoch": 0.9465181058495822, + "grad_norm": 1.587575078010559, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.885542094707489, + "num_tokens": 185818649.0, + "step": 5097 + }, + { + "epoch": 0.9467038068709378, + "grad_norm": 1.4498227834701538, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8590168952941895, + "num_tokens": 185860333.0, + "step": 5098 + }, + { + "epoch": 0.9468895078922934, + "grad_norm": 1.5781965255737305, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8669053316116333, + "num_tokens": 185894593.0, + "step": 5099 + }, + { + "epoch": 0.947075208913649, + "grad_norm": 1.6213699579238892, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8633565902709961, + "num_tokens": 185926751.0, + "step": 5100 + }, + { + "epoch": 0.9472609099350047, + "grad_norm": 1.5369257926940918, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8733770251274109, + "num_tokens": 185961732.0, + "step": 5101 + }, + { + "epoch": 0.9474466109563603, + "grad_norm": 1.4973279237747192, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8747521638870239, + "num_tokens": 185997408.0, + "step": 5102 + }, + { + "epoch": 0.9476323119777159, + "grad_norm": 1.7364635467529297, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8547295928001404, + "num_tokens": 186027148.0, + "step": 5103 + }, + { + "epoch": 0.9478180129990715, + "grad_norm": 1.4868383407592773, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8684355020523071, + "num_tokens": 186059558.0, + "step": 5104 + }, + { + "epoch": 0.9480037140204272, + "grad_norm": 1.4553009271621704, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8579620718955994, + "num_tokens": 186100589.0, + "step": 5105 + }, + { + "epoch": 0.9481894150417828, + "grad_norm": 1.6261322498321533, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8694543838500977, + "num_tokens": 186133345.0, + "step": 5106 + }, + { + "epoch": 0.9483751160631384, + "grad_norm": 1.6758711338043213, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8734705448150635, + "num_tokens": 186167802.0, + "step": 5107 + }, + { + "epoch": 0.9485608170844939, + "grad_norm": 1.604498267173767, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.871566116809845, + "num_tokens": 186201703.0, + "step": 5108 + }, + { + "epoch": 0.9487465181058495, + "grad_norm": 1.5539711713790894, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8587551116943359, + "num_tokens": 186237982.0, + "step": 5109 + }, + { + "epoch": 0.9489322191272052, + "grad_norm": 1.4623346328735352, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8638275861740112, + "num_tokens": 186277792.0, + "step": 5110 + }, + { + "epoch": 0.9491179201485608, + "grad_norm": 1.4660042524337769, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8656810522079468, + "num_tokens": 186315156.0, + "step": 5111 + }, + { + "epoch": 0.9493036211699164, + "grad_norm": 1.4898505210876465, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8721851110458374, + "num_tokens": 186353867.0, + "step": 5112 + }, + { + "epoch": 0.949489322191272, + "grad_norm": 1.5106674432754517, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8629862070083618, + "num_tokens": 186392013.0, + "step": 5113 + }, + { + "epoch": 0.9496750232126276, + "grad_norm": 1.5613154172897339, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8580650091171265, + "num_tokens": 186434219.0, + "step": 5114 + }, + { + "epoch": 0.9498607242339833, + "grad_norm": 1.5658713579177856, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8601388931274414, + "num_tokens": 186471206.0, + "step": 5115 + }, + { + "epoch": 0.9500464252553389, + "grad_norm": 1.5807567834854126, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8684335947036743, + "num_tokens": 186506311.0, + "step": 5116 + }, + { + "epoch": 0.9502321262766945, + "grad_norm": 1.5437943935394287, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.861417293548584, + "num_tokens": 186542494.0, + "step": 5117 + }, + { + "epoch": 0.9504178272980501, + "grad_norm": 1.7286574840545654, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8570604920387268, + "num_tokens": 186573622.0, + "step": 5118 + }, + { + "epoch": 0.9506035283194058, + "grad_norm": 1.4624511003494263, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8587913513183594, + "num_tokens": 186611801.0, + "step": 5119 + }, + { + "epoch": 0.9507892293407614, + "grad_norm": 1.5026538372039795, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8756948113441467, + "num_tokens": 186649709.0, + "step": 5120 + }, + { + "epoch": 0.950974930362117, + "grad_norm": 1.547278642654419, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8590363264083862, + "num_tokens": 186686053.0, + "step": 5121 + }, + { + "epoch": 0.9511606313834726, + "grad_norm": 1.5042760372161865, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8711823225021362, + "num_tokens": 186720568.0, + "step": 5122 + }, + { + "epoch": 0.9513463324048282, + "grad_norm": 1.5033389329910278, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8570225238800049, + "num_tokens": 186755422.0, + "step": 5123 + }, + { + "epoch": 0.9515320334261839, + "grad_norm": 1.967244029045105, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.864590585231781, + "num_tokens": 186795091.0, + "step": 5124 + }, + { + "epoch": 0.9517177344475395, + "grad_norm": 1.482047200202942, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8735118508338928, + "num_tokens": 186830219.0, + "step": 5125 + }, + { + "epoch": 0.9519034354688951, + "grad_norm": 1.4370278120040894, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8650606870651245, + "num_tokens": 186871718.0, + "step": 5126 + }, + { + "epoch": 0.9520891364902507, + "grad_norm": 1.6674867868423462, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8672115206718445, + "num_tokens": 186905925.0, + "step": 5127 + }, + { + "epoch": 0.9522748375116064, + "grad_norm": 1.533760905265808, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8790405988693237, + "num_tokens": 186942865.0, + "step": 5128 + }, + { + "epoch": 0.952460538532962, + "grad_norm": 1.5659860372543335, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8749146461486816, + "num_tokens": 186975672.0, + "step": 5129 + }, + { + "epoch": 0.9526462395543176, + "grad_norm": 1.4877245426177979, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8634838461875916, + "num_tokens": 187013535.0, + "step": 5130 + }, + { + "epoch": 0.9528319405756732, + "grad_norm": 1.4597313404083252, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8634293079376221, + "num_tokens": 187051630.0, + "step": 5131 + }, + { + "epoch": 0.9530176415970287, + "grad_norm": 1.565202236175537, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8688458204269409, + "num_tokens": 187084926.0, + "step": 5132 + }, + { + "epoch": 0.9532033426183844, + "grad_norm": 1.5052757263183594, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8634445667266846, + "num_tokens": 187123458.0, + "step": 5133 + }, + { + "epoch": 0.95338904363974, + "grad_norm": 1.4933662414550781, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.868257999420166, + "num_tokens": 187165003.0, + "step": 5134 + }, + { + "epoch": 0.9535747446610956, + "grad_norm": 1.5129424333572388, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.884354829788208, + "num_tokens": 187196081.0, + "step": 5135 + }, + { + "epoch": 0.9537604456824512, + "grad_norm": 1.45187509059906, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8810194134712219, + "num_tokens": 187231541.0, + "step": 5136 + }, + { + "epoch": 0.9539461467038068, + "grad_norm": 1.600490689277649, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8520503044128418, + "num_tokens": 187271473.0, + "step": 5137 + }, + { + "epoch": 0.9541318477251625, + "grad_norm": 1.6229780912399292, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8596135973930359, + "num_tokens": 187306781.0, + "step": 5138 + }, + { + "epoch": 0.9543175487465181, + "grad_norm": 1.5587060451507568, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8802975416183472, + "num_tokens": 187336610.0, + "step": 5139 + }, + { + "epoch": 0.9545032497678737, + "grad_norm": 1.5132592916488647, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8647344708442688, + "num_tokens": 187372335.0, + "step": 5140 + }, + { + "epoch": 0.9546889507892293, + "grad_norm": 1.5882104635238647, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8766998052597046, + "num_tokens": 187403138.0, + "step": 5141 + }, + { + "epoch": 0.954874651810585, + "grad_norm": 1.3953514099121094, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.875451922416687, + "num_tokens": 187443857.0, + "step": 5142 + }, + { + "epoch": 0.9550603528319406, + "grad_norm": 1.4225605726242065, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8635108470916748, + "num_tokens": 187483927.0, + "step": 5143 + }, + { + "epoch": 0.9552460538532962, + "grad_norm": 1.2986080646514893, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.875817596912384, + "num_tokens": 187530427.0, + "step": 5144 + }, + { + "epoch": 0.9554317548746518, + "grad_norm": 1.4967033863067627, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8787118196487427, + "num_tokens": 187566367.0, + "step": 5145 + }, + { + "epoch": 0.9556174558960074, + "grad_norm": 1.5734890699386597, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8507785797119141, + "num_tokens": 187601138.0, + "step": 5146 + }, + { + "epoch": 0.9558031569173631, + "grad_norm": 1.3647648096084595, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8847601413726807, + "num_tokens": 187639287.0, + "step": 5147 + }, + { + "epoch": 0.9559888579387187, + "grad_norm": 1.4131386280059814, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8706841468811035, + "num_tokens": 187677724.0, + "step": 5148 + }, + { + "epoch": 0.9561745589600743, + "grad_norm": 1.5020315647125244, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8779088854789734, + "num_tokens": 187709870.0, + "step": 5149 + }, + { + "epoch": 0.9563602599814299, + "grad_norm": 1.5395796298980713, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8750706911087036, + "num_tokens": 187744269.0, + "step": 5150 + }, + { + "epoch": 0.9565459610027855, + "grad_norm": 1.5899204015731812, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8728928565979004, + "num_tokens": 187777453.0, + "step": 5151 + }, + { + "epoch": 0.9567316620241412, + "grad_norm": 1.5994136333465576, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8567890524864197, + "num_tokens": 187815384.0, + "step": 5152 + }, + { + "epoch": 0.9569173630454968, + "grad_norm": 1.4485464096069336, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8694382905960083, + "num_tokens": 187853222.0, + "step": 5153 + }, + { + "epoch": 0.9571030640668524, + "grad_norm": 1.437536358833313, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.87265944480896, + "num_tokens": 187891177.0, + "step": 5154 + }, + { + "epoch": 0.957288765088208, + "grad_norm": 1.5274202823638916, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8616443872451782, + "num_tokens": 187928477.0, + "step": 5155 + }, + { + "epoch": 0.9574744661095635, + "grad_norm": 1.6152827739715576, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8654819130897522, + "num_tokens": 187963565.0, + "step": 5156 + }, + { + "epoch": 0.9576601671309192, + "grad_norm": 1.5435463190078735, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8722873330116272, + "num_tokens": 187997344.0, + "step": 5157 + }, + { + "epoch": 0.9578458681522748, + "grad_norm": 1.5584352016448975, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8669052720069885, + "num_tokens": 188034833.0, + "step": 5158 + }, + { + "epoch": 0.9580315691736304, + "grad_norm": 1.38770592212677, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8659375905990601, + "num_tokens": 188077875.0, + "step": 5159 + }, + { + "epoch": 0.958217270194986, + "grad_norm": 1.5459219217300415, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8810838460922241, + "num_tokens": 188110759.0, + "step": 5160 + }, + { + "epoch": 0.9584029712163417, + "grad_norm": 1.5005052089691162, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8619213700294495, + "num_tokens": 188148718.0, + "step": 5161 + }, + { + "epoch": 0.9585886722376973, + "grad_norm": 1.4476925134658813, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.885342001914978, + "num_tokens": 188185994.0, + "step": 5162 + }, + { + "epoch": 0.9587743732590529, + "grad_norm": 1.4303102493286133, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8821114301681519, + "num_tokens": 188221457.0, + "step": 5163 + }, + { + "epoch": 0.9589600742804085, + "grad_norm": 1.5562491416931152, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8673681020736694, + "num_tokens": 188256084.0, + "step": 5164 + }, + { + "epoch": 0.9591457753017641, + "grad_norm": 1.4924906492233276, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8712657690048218, + "num_tokens": 188294481.0, + "step": 5165 + }, + { + "epoch": 0.9593314763231198, + "grad_norm": 1.397680401802063, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8668317794799805, + "num_tokens": 188333432.0, + "step": 5166 + }, + { + "epoch": 0.9595171773444754, + "grad_norm": 1.5358426570892334, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8651309609413147, + "num_tokens": 188368749.0, + "step": 5167 + }, + { + "epoch": 0.959702878365831, + "grad_norm": 1.5381593704223633, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8747442364692688, + "num_tokens": 188405373.0, + "step": 5168 + }, + { + "epoch": 0.9598885793871866, + "grad_norm": 1.57423996925354, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8713902235031128, + "num_tokens": 188436562.0, + "step": 5169 + }, + { + "epoch": 0.9600742804085423, + "grad_norm": 1.4943126440048218, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8573281764984131, + "num_tokens": 188473955.0, + "step": 5170 + }, + { + "epoch": 0.9602599814298979, + "grad_norm": 1.3705848455429077, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8701630234718323, + "num_tokens": 188516287.0, + "step": 5171 + }, + { + "epoch": 0.9604456824512535, + "grad_norm": 1.466825246810913, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8765729069709778, + "num_tokens": 188551489.0, + "step": 5172 + }, + { + "epoch": 0.9606313834726091, + "grad_norm": 1.517354130744934, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8647482395172119, + "num_tokens": 188589876.0, + "step": 5173 + }, + { + "epoch": 0.9608170844939647, + "grad_norm": 1.665964126586914, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8503322601318359, + "num_tokens": 188622947.0, + "step": 5174 + }, + { + "epoch": 0.9610027855153204, + "grad_norm": 1.439543604850769, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8641036152839661, + "num_tokens": 188665667.0, + "step": 5175 + }, + { + "epoch": 0.961188486536676, + "grad_norm": 1.5191863775253296, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8550697565078735, + "num_tokens": 188704940.0, + "step": 5176 + }, + { + "epoch": 0.9613741875580316, + "grad_norm": 1.5987889766693115, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8757932186126709, + "num_tokens": 188736380.0, + "step": 5177 + }, + { + "epoch": 0.9615598885793872, + "grad_norm": 1.5142780542373657, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.856674075126648, + "num_tokens": 188776214.0, + "step": 5178 + }, + { + "epoch": 0.9617455896007429, + "grad_norm": 1.5130977630615234, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.869674026966095, + "num_tokens": 188810449.0, + "step": 5179 + }, + { + "epoch": 0.9619312906220984, + "grad_norm": 1.7237883806228638, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8683481216430664, + "num_tokens": 188841115.0, + "step": 5180 + }, + { + "epoch": 0.962116991643454, + "grad_norm": 1.4935733079910278, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8669347167015076, + "num_tokens": 188882911.0, + "step": 5181 + }, + { + "epoch": 0.9623026926648096, + "grad_norm": 1.6492751836776733, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8805952072143555, + "num_tokens": 188919522.0, + "step": 5182 + }, + { + "epoch": 0.9624883936861652, + "grad_norm": 1.5771604776382446, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8735272884368896, + "num_tokens": 188950327.0, + "step": 5183 + }, + { + "epoch": 0.9626740947075209, + "grad_norm": 1.372849464416504, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8717761635780334, + "num_tokens": 188991425.0, + "step": 5184 + }, + { + "epoch": 0.9628597957288765, + "grad_norm": 1.528132438659668, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8561440110206604, + "num_tokens": 189029591.0, + "step": 5185 + }, + { + "epoch": 0.9630454967502321, + "grad_norm": 1.5633811950683594, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8608465790748596, + "num_tokens": 189067364.0, + "step": 5186 + }, + { + "epoch": 0.9632311977715877, + "grad_norm": 1.4579689502716064, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8842951059341431, + "num_tokens": 189101848.0, + "step": 5187 + }, + { + "epoch": 0.9634168987929433, + "grad_norm": 1.6738084554672241, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8620498776435852, + "num_tokens": 189133467.0, + "step": 5188 + }, + { + "epoch": 0.963602599814299, + "grad_norm": 1.5641590356826782, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8647221326828003, + "num_tokens": 189165243.0, + "step": 5189 + }, + { + "epoch": 0.9637883008356546, + "grad_norm": 1.3531553745269775, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8812490701675415, + "num_tokens": 189204295.0, + "step": 5190 + }, + { + "epoch": 0.9639740018570102, + "grad_norm": 1.4856709241867065, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8727507591247559, + "num_tokens": 189241513.0, + "step": 5191 + }, + { + "epoch": 0.9641597028783658, + "grad_norm": 1.4677995443344116, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8766330480575562, + "num_tokens": 189279116.0, + "step": 5192 + }, + { + "epoch": 0.9643454038997215, + "grad_norm": 1.5190961360931396, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8543633818626404, + "num_tokens": 189318967.0, + "step": 5193 + }, + { + "epoch": 0.9645311049210771, + "grad_norm": 1.5633326768875122, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8658065795898438, + "num_tokens": 189354355.0, + "step": 5194 + }, + { + "epoch": 0.9647168059424327, + "grad_norm": 1.5004310607910156, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8646374940872192, + "num_tokens": 189392331.0, + "step": 5195 + }, + { + "epoch": 0.9649025069637883, + "grad_norm": 1.5416510105133057, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8766357898712158, + "num_tokens": 189425104.0, + "step": 5196 + }, + { + "epoch": 0.9650882079851439, + "grad_norm": 1.6670135259628296, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.879102885723114, + "num_tokens": 189457946.0, + "step": 5197 + }, + { + "epoch": 0.9652739090064996, + "grad_norm": 1.5755316019058228, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8641794919967651, + "num_tokens": 189492976.0, + "step": 5198 + }, + { + "epoch": 0.9654596100278552, + "grad_norm": 1.431510090827942, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8734835982322693, + "num_tokens": 189528827.0, + "step": 5199 + }, + { + "epoch": 0.9656453110492108, + "grad_norm": 1.4581592082977295, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8770501613616943, + "num_tokens": 189567356.0, + "step": 5200 + }, + { + "epoch": 0.9658310120705664, + "grad_norm": 1.4756945371627808, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8830720782279968, + "num_tokens": 189606020.0, + "step": 5201 + }, + { + "epoch": 0.966016713091922, + "grad_norm": 1.422236442565918, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8620260953903198, + "num_tokens": 189647921.0, + "step": 5202 + }, + { + "epoch": 0.9662024141132777, + "grad_norm": 1.565896987915039, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8685473799705505, + "num_tokens": 189685484.0, + "step": 5203 + }, + { + "epoch": 0.9663881151346332, + "grad_norm": 1.5633331537246704, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8622390031814575, + "num_tokens": 189724890.0, + "step": 5204 + }, + { + "epoch": 0.9665738161559888, + "grad_norm": 1.5770885944366455, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8760900497436523, + "num_tokens": 189758932.0, + "step": 5205 + }, + { + "epoch": 0.9667595171773444, + "grad_norm": 1.42168128490448, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8725780844688416, + "num_tokens": 189800620.0, + "step": 5206 + }, + { + "epoch": 0.9669452181987, + "grad_norm": 1.362890601158142, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8744359016418457, + "num_tokens": 189842893.0, + "step": 5207 + }, + { + "epoch": 0.9671309192200557, + "grad_norm": 1.4945846796035767, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8611666560173035, + "num_tokens": 189879660.0, + "step": 5208 + }, + { + "epoch": 0.9673166202414113, + "grad_norm": 1.5406415462493896, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8748410940170288, + "num_tokens": 189916104.0, + "step": 5209 + }, + { + "epoch": 0.9675023212627669, + "grad_norm": 1.4889827966690063, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8805225491523743, + "num_tokens": 189952528.0, + "step": 5210 + }, + { + "epoch": 0.9676880222841225, + "grad_norm": 1.5388801097869873, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8657339811325073, + "num_tokens": 189988388.0, + "step": 5211 + }, + { + "epoch": 0.9678737233054782, + "grad_norm": 1.4752817153930664, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8736704587936401, + "num_tokens": 190028775.0, + "step": 5212 + }, + { + "epoch": 0.9680594243268338, + "grad_norm": 1.4894046783447266, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8593652248382568, + "num_tokens": 190069326.0, + "step": 5213 + }, + { + "epoch": 0.9682451253481894, + "grad_norm": 1.5384259223937988, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8597291111946106, + "num_tokens": 190103777.0, + "step": 5214 + }, + { + "epoch": 0.968430826369545, + "grad_norm": 1.4882091283798218, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.872920036315918, + "num_tokens": 190142427.0, + "step": 5215 + }, + { + "epoch": 0.9686165273909007, + "grad_norm": 1.5547361373901367, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8598822951316833, + "num_tokens": 190178393.0, + "step": 5216 + }, + { + "epoch": 0.9688022284122563, + "grad_norm": 1.5871469974517822, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8693248629570007, + "num_tokens": 190212823.0, + "step": 5217 + }, + { + "epoch": 0.9689879294336119, + "grad_norm": 1.4179935455322266, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8676657676696777, + "num_tokens": 190254985.0, + "step": 5218 + }, + { + "epoch": 0.9691736304549675, + "grad_norm": 1.44025719165802, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8765113353729248, + "num_tokens": 190290877.0, + "step": 5219 + }, + { + "epoch": 0.9693593314763231, + "grad_norm": 1.47028386592865, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8896916508674622, + "num_tokens": 190325656.0, + "step": 5220 + }, + { + "epoch": 0.9695450324976788, + "grad_norm": 1.5266984701156616, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8809233903884888, + "num_tokens": 190359179.0, + "step": 5221 + }, + { + "epoch": 0.9697307335190344, + "grad_norm": 1.5223032236099243, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8602694869041443, + "num_tokens": 190400433.0, + "step": 5222 + }, + { + "epoch": 0.96991643454039, + "grad_norm": 1.5408002138137817, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8716721534729004, + "num_tokens": 190433238.0, + "step": 5223 + }, + { + "epoch": 0.9701021355617456, + "grad_norm": 1.6336424350738525, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8571895956993103, + "num_tokens": 190470648.0, + "step": 5224 + }, + { + "epoch": 0.9702878365831012, + "grad_norm": 1.4841506481170654, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8581241369247437, + "num_tokens": 190511854.0, + "step": 5225 + }, + { + "epoch": 0.9704735376044569, + "grad_norm": 1.4437774419784546, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8807753324508667, + "num_tokens": 190547332.0, + "step": 5226 + }, + { + "epoch": 0.9706592386258125, + "grad_norm": 1.5576637983322144, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8635237812995911, + "num_tokens": 190587408.0, + "step": 5227 + }, + { + "epoch": 0.9708449396471681, + "grad_norm": 1.4852975606918335, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8694655895233154, + "num_tokens": 190625698.0, + "step": 5228 + }, + { + "epoch": 0.9710306406685236, + "grad_norm": 1.4797853231430054, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8684639930725098, + "num_tokens": 190669305.0, + "step": 5229 + }, + { + "epoch": 0.9712163416898792, + "grad_norm": 1.4533270597457886, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8681753277778625, + "num_tokens": 190707623.0, + "step": 5230 + }, + { + "epoch": 0.9714020427112349, + "grad_norm": 1.6346535682678223, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8655983209609985, + "num_tokens": 190742101.0, + "step": 5231 + }, + { + "epoch": 0.9715877437325905, + "grad_norm": 1.5742236375808716, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8647854924201965, + "num_tokens": 190773738.0, + "step": 5232 + }, + { + "epoch": 0.9717734447539461, + "grad_norm": 1.8476911783218384, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8590808510780334, + "num_tokens": 190803474.0, + "step": 5233 + }, + { + "epoch": 0.9719591457753017, + "grad_norm": 1.6860170364379883, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.864772379398346, + "num_tokens": 190836938.0, + "step": 5234 + }, + { + "epoch": 0.9721448467966574, + "grad_norm": 1.8487921953201294, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8560189604759216, + "num_tokens": 190863585.0, + "step": 5235 + }, + { + "epoch": 0.972330547818013, + "grad_norm": 1.4331055879592896, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8744332790374756, + "num_tokens": 190903983.0, + "step": 5236 + }, + { + "epoch": 0.9725162488393686, + "grad_norm": 1.5333210229873657, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8735448122024536, + "num_tokens": 190939865.0, + "step": 5237 + }, + { + "epoch": 0.9727019498607242, + "grad_norm": 1.4516487121582031, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8599352836608887, + "num_tokens": 190978189.0, + "step": 5238 + }, + { + "epoch": 0.9728876508820798, + "grad_norm": 1.5245542526245117, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8684737086296082, + "num_tokens": 191009558.0, + "step": 5239 + }, + { + "epoch": 0.9730733519034355, + "grad_norm": 1.530303955078125, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8816049098968506, + "num_tokens": 191046611.0, + "step": 5240 + }, + { + "epoch": 0.9732590529247911, + "grad_norm": 1.5082144737243652, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8624939918518066, + "num_tokens": 191083890.0, + "step": 5241 + }, + { + "epoch": 0.9734447539461467, + "grad_norm": 1.6173210144042969, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8674826622009277, + "num_tokens": 191116378.0, + "step": 5242 + }, + { + "epoch": 0.9736304549675023, + "grad_norm": 1.4947211742401123, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8481699228286743, + "num_tokens": 191155178.0, + "step": 5243 + }, + { + "epoch": 0.973816155988858, + "grad_norm": 1.8026912212371826, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8679912090301514, + "num_tokens": 191194948.0, + "step": 5244 + }, + { + "epoch": 0.9740018570102136, + "grad_norm": 1.616763949394226, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8785082697868347, + "num_tokens": 191227298.0, + "step": 5245 + }, + { + "epoch": 0.9741875580315692, + "grad_norm": 1.6485260725021362, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.88050377368927, + "num_tokens": 191258949.0, + "step": 5246 + }, + { + "epoch": 0.9743732590529248, + "grad_norm": 1.4623454809188843, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8703140020370483, + "num_tokens": 191294670.0, + "step": 5247 + }, + { + "epoch": 0.9745589600742804, + "grad_norm": 1.4741270542144775, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8725131750106812, + "num_tokens": 191331378.0, + "step": 5248 + }, + { + "epoch": 0.9747446610956361, + "grad_norm": 1.530504584312439, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8650796413421631, + "num_tokens": 191369674.0, + "step": 5249 + }, + { + "epoch": 0.9749303621169917, + "grad_norm": 1.5545196533203125, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.88228440284729, + "num_tokens": 191402463.0, + "step": 5250 + }, + { + "epoch": 0.9751160631383473, + "grad_norm": 1.5726127624511719, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8657535314559937, + "num_tokens": 191436254.0, + "step": 5251 + }, + { + "epoch": 0.9753017641597029, + "grad_norm": 1.5905171632766724, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8616812229156494, + "num_tokens": 191473802.0, + "step": 5252 + }, + { + "epoch": 0.9754874651810584, + "grad_norm": 1.740166187286377, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8670347332954407, + "num_tokens": 191505090.0, + "step": 5253 + }, + { + "epoch": 0.9756731662024141, + "grad_norm": 1.5796935558319092, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8555534482002258, + "num_tokens": 191541819.0, + "step": 5254 + }, + { + "epoch": 0.9758588672237697, + "grad_norm": 1.4831234216690063, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8714967370033264, + "num_tokens": 191579368.0, + "step": 5255 + }, + { + "epoch": 0.9760445682451253, + "grad_norm": 1.46976900100708, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8653526306152344, + "num_tokens": 191617455.0, + "step": 5256 + }, + { + "epoch": 0.9762302692664809, + "grad_norm": 1.4609096050262451, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8815116882324219, + "num_tokens": 191654718.0, + "step": 5257 + }, + { + "epoch": 0.9764159702878366, + "grad_norm": 1.4795759916305542, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8639417886734009, + "num_tokens": 191691535.0, + "step": 5258 + }, + { + "epoch": 0.9766016713091922, + "grad_norm": 1.622737169265747, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8701194524765015, + "num_tokens": 191722903.0, + "step": 5259 + }, + { + "epoch": 0.9767873723305478, + "grad_norm": 1.52027428150177, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.873335599899292, + "num_tokens": 191761470.0, + "step": 5260 + }, + { + "epoch": 0.9769730733519034, + "grad_norm": 1.4295217990875244, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8837797045707703, + "num_tokens": 191804139.0, + "step": 5261 + }, + { + "epoch": 0.977158774373259, + "grad_norm": 1.622201681137085, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8645973205566406, + "num_tokens": 191840943.0, + "step": 5262 + }, + { + "epoch": 0.9773444753946147, + "grad_norm": 1.412369728088379, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8761911392211914, + "num_tokens": 191877981.0, + "step": 5263 + }, + { + "epoch": 0.9775301764159703, + "grad_norm": 1.40791654586792, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8658853769302368, + "num_tokens": 191916844.0, + "step": 5264 + }, + { + "epoch": 0.9777158774373259, + "grad_norm": 1.4965121746063232, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8601169586181641, + "num_tokens": 191955912.0, + "step": 5265 + }, + { + "epoch": 0.9779015784586815, + "grad_norm": 1.4939888715744019, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8661848902702332, + "num_tokens": 191994610.0, + "step": 5266 + }, + { + "epoch": 0.9780872794800372, + "grad_norm": 1.5125747919082642, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8841235637664795, + "num_tokens": 192027223.0, + "step": 5267 + }, + { + "epoch": 0.9782729805013928, + "grad_norm": 1.433445692062378, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8795165419578552, + "num_tokens": 192063759.0, + "step": 5268 + }, + { + "epoch": 0.9784586815227484, + "grad_norm": 1.603849172592163, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8530449867248535, + "num_tokens": 192101638.0, + "step": 5269 + }, + { + "epoch": 0.978644382544104, + "grad_norm": 1.8937653303146362, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8568406105041504, + "num_tokens": 192129125.0, + "step": 5270 + }, + { + "epoch": 0.9788300835654596, + "grad_norm": 1.5563101768493652, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8667112588882446, + "num_tokens": 192166455.0, + "step": 5271 + }, + { + "epoch": 0.9790157845868153, + "grad_norm": 1.530238389968872, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8698469996452332, + "num_tokens": 192201744.0, + "step": 5272 + }, + { + "epoch": 0.9792014856081709, + "grad_norm": 1.5304913520812988, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8747685551643372, + "num_tokens": 192235190.0, + "step": 5273 + }, + { + "epoch": 0.9793871866295265, + "grad_norm": 1.7343910932540894, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8579884171485901, + "num_tokens": 192264870.0, + "step": 5274 + }, + { + "epoch": 0.9795728876508821, + "grad_norm": 1.4618475437164307, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8718471527099609, + "num_tokens": 192306238.0, + "step": 5275 + }, + { + "epoch": 0.9797585886722378, + "grad_norm": 1.563019037246704, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.880236029624939, + "num_tokens": 192339686.0, + "step": 5276 + }, + { + "epoch": 0.9799442896935933, + "grad_norm": 1.6178805828094482, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8658722043037415, + "num_tokens": 192371702.0, + "step": 5277 + }, + { + "epoch": 0.9801299907149489, + "grad_norm": 1.595820426940918, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8735575675964355, + "num_tokens": 192404506.0, + "step": 5278 + }, + { + "epoch": 0.9803156917363045, + "grad_norm": 1.5332828760147095, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8682889342308044, + "num_tokens": 192440102.0, + "step": 5279 + }, + { + "epoch": 0.9805013927576601, + "grad_norm": 1.5031553506851196, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8814258575439453, + "num_tokens": 192475625.0, + "step": 5280 + }, + { + "epoch": 0.9806870937790158, + "grad_norm": 1.5488256216049194, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8763654232025146, + "num_tokens": 192507630.0, + "step": 5281 + }, + { + "epoch": 0.9808727948003714, + "grad_norm": 1.535904884338379, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8619362711906433, + "num_tokens": 192544172.0, + "step": 5282 + }, + { + "epoch": 0.981058495821727, + "grad_norm": 1.4458703994750977, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.869920015335083, + "num_tokens": 192583366.0, + "step": 5283 + }, + { + "epoch": 0.9812441968430826, + "grad_norm": 1.4515947103500366, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8727161884307861, + "num_tokens": 192623452.0, + "step": 5284 + }, + { + "epoch": 0.9814298978644382, + "grad_norm": 1.408026099205017, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8860448598861694, + "num_tokens": 192658014.0, + "step": 5285 + }, + { + "epoch": 0.9816155988857939, + "grad_norm": 1.5203529596328735, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8710811138153076, + "num_tokens": 192692678.0, + "step": 5286 + }, + { + "epoch": 0.9818012999071495, + "grad_norm": 1.493229866027832, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8724638223648071, + "num_tokens": 192731884.0, + "step": 5287 + }, + { + "epoch": 0.9819870009285051, + "grad_norm": 1.7254241704940796, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8477844595909119, + "num_tokens": 192764422.0, + "step": 5288 + }, + { + "epoch": 0.9821727019498607, + "grad_norm": 1.6425726413726807, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8606404066085815, + "num_tokens": 192797930.0, + "step": 5289 + }, + { + "epoch": 0.9823584029712163, + "grad_norm": 1.6241490840911865, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8690552115440369, + "num_tokens": 192833469.0, + "step": 5290 + }, + { + "epoch": 0.982544103992572, + "grad_norm": 1.5724022388458252, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8544631600379944, + "num_tokens": 192872858.0, + "step": 5291 + }, + { + "epoch": 0.9827298050139276, + "grad_norm": 1.3349506855010986, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8731430172920227, + "num_tokens": 192916870.0, + "step": 5292 + }, + { + "epoch": 0.9829155060352832, + "grad_norm": 1.573954701423645, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8660389184951782, + "num_tokens": 192950789.0, + "step": 5293 + }, + { + "epoch": 0.9831012070566388, + "grad_norm": 1.5673768520355225, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.869692325592041, + "num_tokens": 192986792.0, + "step": 5294 + }, + { + "epoch": 0.9832869080779945, + "grad_norm": 1.4859561920166016, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8683279156684875, + "num_tokens": 193026814.0, + "step": 5295 + }, + { + "epoch": 0.9834726090993501, + "grad_norm": 1.4324240684509277, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8637295961380005, + "num_tokens": 193066104.0, + "step": 5296 + }, + { + "epoch": 0.9836583101207057, + "grad_norm": 1.4663794040679932, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8720252513885498, + "num_tokens": 193102152.0, + "step": 5297 + }, + { + "epoch": 0.9838440111420613, + "grad_norm": 1.5282090902328491, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8705750703811646, + "num_tokens": 193135928.0, + "step": 5298 + }, + { + "epoch": 0.984029712163417, + "grad_norm": 1.4487547874450684, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.871371865272522, + "num_tokens": 193173611.0, + "step": 5299 + }, + { + "epoch": 0.9842154131847726, + "grad_norm": 1.4977988004684448, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8694660663604736, + "num_tokens": 193209425.0, + "step": 5300 + }, + { + "epoch": 0.9844011142061281, + "grad_norm": 1.4791836738586426, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8763845562934875, + "num_tokens": 193247257.0, + "step": 5301 + }, + { + "epoch": 0.9845868152274837, + "grad_norm": 1.6356357336044312, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8632301688194275, + "num_tokens": 193282382.0, + "step": 5302 + }, + { + "epoch": 0.9847725162488393, + "grad_norm": 1.521070957183838, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8707481622695923, + "num_tokens": 193323462.0, + "step": 5303 + }, + { + "epoch": 0.984958217270195, + "grad_norm": 1.5365204811096191, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8625046610832214, + "num_tokens": 193359304.0, + "step": 5304 + }, + { + "epoch": 0.9851439182915506, + "grad_norm": 1.7188782691955566, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8646398782730103, + "num_tokens": 193389047.0, + "step": 5305 + }, + { + "epoch": 0.9853296193129062, + "grad_norm": 1.6910911798477173, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8666192293167114, + "num_tokens": 193420572.0, + "step": 5306 + }, + { + "epoch": 0.9855153203342618, + "grad_norm": 1.6836711168289185, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8781933784484863, + "num_tokens": 193453250.0, + "step": 5307 + }, + { + "epoch": 0.9857010213556174, + "grad_norm": 1.380088448524475, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.865525484085083, + "num_tokens": 193496337.0, + "step": 5308 + }, + { + "epoch": 0.9858867223769731, + "grad_norm": 1.5767329931259155, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8546175956726074, + "num_tokens": 193531527.0, + "step": 5309 + }, + { + "epoch": 0.9860724233983287, + "grad_norm": 1.542517900466919, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8501229286193848, + "num_tokens": 193568534.0, + "step": 5310 + }, + { + "epoch": 0.9862581244196843, + "grad_norm": 1.5003929138183594, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8817465305328369, + "num_tokens": 193600586.0, + "step": 5311 + }, + { + "epoch": 0.9864438254410399, + "grad_norm": 1.498747706413269, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8659641742706299, + "num_tokens": 193640216.0, + "step": 5312 + }, + { + "epoch": 0.9866295264623955, + "grad_norm": 1.5658437013626099, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8678651452064514, + "num_tokens": 193676633.0, + "step": 5313 + }, + { + "epoch": 0.9868152274837512, + "grad_norm": 1.4405457973480225, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8778657913208008, + "num_tokens": 193711732.0, + "step": 5314 + }, + { + "epoch": 0.9870009285051068, + "grad_norm": 1.4237275123596191, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8748623132705688, + "num_tokens": 193751624.0, + "step": 5315 + }, + { + "epoch": 0.9871866295264624, + "grad_norm": 1.4952319860458374, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8894355297088623, + "num_tokens": 193785344.0, + "step": 5316 + }, + { + "epoch": 0.987372330547818, + "grad_norm": 1.5118316411972046, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8692361116409302, + "num_tokens": 193818162.0, + "step": 5317 + }, + { + "epoch": 0.9875580315691737, + "grad_norm": 1.520782232284546, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8671401739120483, + "num_tokens": 193857914.0, + "step": 5318 + }, + { + "epoch": 0.9877437325905293, + "grad_norm": 1.3410515785217285, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8721462488174438, + "num_tokens": 193903213.0, + "step": 5319 + }, + { + "epoch": 0.9879294336118849, + "grad_norm": 1.4620676040649414, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8693323135375977, + "num_tokens": 193939632.0, + "step": 5320 + }, + { + "epoch": 0.9881151346332405, + "grad_norm": 1.6860885620117188, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8746681213378906, + "num_tokens": 193969980.0, + "step": 5321 + }, + { + "epoch": 0.9883008356545961, + "grad_norm": 1.7310236692428589, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8595970273017883, + "num_tokens": 194000642.0, + "step": 5322 + }, + { + "epoch": 0.9884865366759518, + "grad_norm": 1.5571870803833008, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8829712867736816, + "num_tokens": 194035098.0, + "step": 5323 + }, + { + "epoch": 0.9886722376973074, + "grad_norm": 1.500821590423584, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8846157789230347, + "num_tokens": 194069577.0, + "step": 5324 + }, + { + "epoch": 0.9888579387186629, + "grad_norm": 1.4447683095932007, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8803019523620605, + "num_tokens": 194105429.0, + "step": 5325 + }, + { + "epoch": 0.9890436397400185, + "grad_norm": 1.5224995613098145, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8631561994552612, + "num_tokens": 194143690.0, + "step": 5326 + }, + { + "epoch": 0.9892293407613741, + "grad_norm": 1.4979002475738525, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8807899355888367, + "num_tokens": 194181090.0, + "step": 5327 + }, + { + "epoch": 0.9894150417827298, + "grad_norm": 1.3782917261123657, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8817949295043945, + "num_tokens": 194221272.0, + "step": 5328 + }, + { + "epoch": 0.9896007428040854, + "grad_norm": 1.6647156476974487, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8549771308898926, + "num_tokens": 194252106.0, + "step": 5329 + }, + { + "epoch": 0.989786443825441, + "grad_norm": 1.3927868604660034, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8857429027557373, + "num_tokens": 194289873.0, + "step": 5330 + }, + { + "epoch": 0.9899721448467966, + "grad_norm": 1.5279521942138672, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8769361972808838, + "num_tokens": 194324695.0, + "step": 5331 + }, + { + "epoch": 0.9901578458681523, + "grad_norm": 1.4229718446731567, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8652358055114746, + "num_tokens": 194366445.0, + "step": 5332 + }, + { + "epoch": 0.9903435468895079, + "grad_norm": 1.4195184707641602, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8696993589401245, + "num_tokens": 194402459.0, + "step": 5333 + }, + { + "epoch": 0.9905292479108635, + "grad_norm": 1.5183295011520386, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8585859537124634, + "num_tokens": 194440886.0, + "step": 5334 + }, + { + "epoch": 0.9907149489322191, + "grad_norm": 1.4806145429611206, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8810563087463379, + "num_tokens": 194478055.0, + "step": 5335 + }, + { + "epoch": 0.9909006499535747, + "grad_norm": 1.3642476797103882, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8603438138961792, + "num_tokens": 194521529.0, + "step": 5336 + }, + { + "epoch": 0.9910863509749304, + "grad_norm": 1.642442226409912, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8715603947639465, + "num_tokens": 194556462.0, + "step": 5337 + }, + { + "epoch": 0.991272051996286, + "grad_norm": 1.4120153188705444, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8719199895858765, + "num_tokens": 194598158.0, + "step": 5338 + }, + { + "epoch": 0.9914577530176416, + "grad_norm": 1.4816648960113525, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8668597936630249, + "num_tokens": 194637840.0, + "step": 5339 + }, + { + "epoch": 0.9916434540389972, + "grad_norm": 1.4059948921203613, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8747909665107727, + "num_tokens": 194678993.0, + "step": 5340 + }, + { + "epoch": 0.9918291550603529, + "grad_norm": 1.5286802053451538, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8678319454193115, + "num_tokens": 194712429.0, + "step": 5341 + }, + { + "epoch": 0.9920148560817085, + "grad_norm": 1.6621301174163818, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.87128746509552, + "num_tokens": 194746107.0, + "step": 5342 + }, + { + "epoch": 0.9922005571030641, + "grad_norm": 1.5797995328903198, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.877578854560852, + "num_tokens": 194780548.0, + "step": 5343 + }, + { + "epoch": 0.9923862581244197, + "grad_norm": 1.4128592014312744, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8828978538513184, + "num_tokens": 194819728.0, + "step": 5344 + }, + { + "epoch": 0.9925719591457753, + "grad_norm": 1.4715142250061035, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8556807041168213, + "num_tokens": 194860160.0, + "step": 5345 + }, + { + "epoch": 0.992757660167131, + "grad_norm": 1.453596830368042, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8744505047798157, + "num_tokens": 194896579.0, + "step": 5346 + }, + { + "epoch": 0.9929433611884866, + "grad_norm": 1.5517284870147705, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8701654672622681, + "num_tokens": 194929636.0, + "step": 5347 + }, + { + "epoch": 0.9931290622098422, + "grad_norm": 1.4623632431030273, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8706034421920776, + "num_tokens": 194966523.0, + "step": 5348 + }, + { + "epoch": 0.9933147632311977, + "grad_norm": 1.6653612852096558, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8597109317779541, + "num_tokens": 194997472.0, + "step": 5349 + }, + { + "epoch": 0.9935004642525533, + "grad_norm": 1.4100232124328613, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8763799071311951, + "num_tokens": 195032969.0, + "step": 5350 + }, + { + "epoch": 0.993686165273909, + "grad_norm": 1.6068458557128906, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8767509460449219, + "num_tokens": 195065450.0, + "step": 5351 + }, + { + "epoch": 0.9938718662952646, + "grad_norm": 1.4568164348602295, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.864477276802063, + "num_tokens": 195105542.0, + "step": 5352 + }, + { + "epoch": 0.9940575673166202, + "grad_norm": 1.484057068824768, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8765825033187866, + "num_tokens": 195140512.0, + "step": 5353 + }, + { + "epoch": 0.9942432683379758, + "grad_norm": 1.418497085571289, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8773494958877563, + "num_tokens": 195176987.0, + "step": 5354 + }, + { + "epoch": 0.9944289693593314, + "grad_norm": 1.6809356212615967, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8552666306495667, + "num_tokens": 195210449.0, + "step": 5355 + }, + { + "epoch": 0.9946146703806871, + "grad_norm": 1.6171234846115112, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8631471395492554, + "num_tokens": 195247193.0, + "step": 5356 + }, + { + "epoch": 0.9948003714020427, + "grad_norm": 1.3892173767089844, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8725223541259766, + "num_tokens": 195289082.0, + "step": 5357 + }, + { + "epoch": 0.9949860724233983, + "grad_norm": 1.5040353536605835, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8717150688171387, + "num_tokens": 195326717.0, + "step": 5358 + }, + { + "epoch": 0.9951717734447539, + "grad_norm": 1.4882457256317139, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8654287457466125, + "num_tokens": 195362118.0, + "step": 5359 + }, + { + "epoch": 0.9953574744661096, + "grad_norm": 1.4687687158584595, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8722807765007019, + "num_tokens": 195397713.0, + "step": 5360 + }, + { + "epoch": 0.9955431754874652, + "grad_norm": 1.526444435119629, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.873793363571167, + "num_tokens": 195430103.0, + "step": 5361 + }, + { + "epoch": 0.9957288765088208, + "grad_norm": 1.5344208478927612, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8734297752380371, + "num_tokens": 195465423.0, + "step": 5362 + }, + { + "epoch": 0.9959145775301764, + "grad_norm": 1.3994712829589844, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8713286519050598, + "num_tokens": 195506997.0, + "step": 5363 + }, + { + "epoch": 0.996100278551532, + "grad_norm": 1.4749263525009155, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8644744157791138, + "num_tokens": 195547590.0, + "step": 5364 + }, + { + "epoch": 0.9962859795728877, + "grad_norm": 1.6102378368377686, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8663007020950317, + "num_tokens": 195578444.0, + "step": 5365 + }, + { + "epoch": 0.9964716805942433, + "grad_norm": 1.5248438119888306, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.88518887758255, + "num_tokens": 195609212.0, + "step": 5366 + }, + { + "epoch": 0.9966573816155989, + "grad_norm": 1.5805511474609375, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8568764925003052, + "num_tokens": 195644166.0, + "step": 5367 + }, + { + "epoch": 0.9968430826369545, + "grad_norm": 1.485685110092163, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8744578957557678, + "num_tokens": 195679139.0, + "step": 5368 + }, + { + "epoch": 0.9970287836583102, + "grad_norm": 1.5334147214889526, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8738377094268799, + "num_tokens": 195714960.0, + "step": 5369 + }, + { + "epoch": 0.9972144846796658, + "grad_norm": 1.5523115396499634, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.858052670955658, + "num_tokens": 195752481.0, + "step": 5370 + }, + { + "epoch": 0.9974001857010214, + "grad_norm": 1.3887437582015991, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8782943487167358, + "num_tokens": 195791827.0, + "step": 5371 + }, + { + "epoch": 0.997585886722377, + "grad_norm": 1.4708439111709595, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8724619150161743, + "num_tokens": 195829976.0, + "step": 5372 + }, + { + "epoch": 0.9977715877437325, + "grad_norm": 1.5724799633026123, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.864175021648407, + "num_tokens": 195862204.0, + "step": 5373 + }, + { + "epoch": 0.9979572887650882, + "grad_norm": 1.4759910106658936, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8777557611465454, + "num_tokens": 195896569.0, + "step": 5374 + }, + { + "epoch": 0.9981429897864438, + "grad_norm": 1.4317477941513062, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8704284429550171, + "num_tokens": 195937159.0, + "step": 5375 + }, + { + "epoch": 0.9983286908077994, + "grad_norm": 1.509628415107727, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8581370711326599, + "num_tokens": 195976501.0, + "step": 5376 + }, + { + "epoch": 0.998514391829155, + "grad_norm": 1.5026637315750122, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8869392275810242, + "num_tokens": 196010990.0, + "step": 5377 + }, + { + "epoch": 0.9987000928505106, + "grad_norm": 1.3549422025680542, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8832964897155762, + "num_tokens": 196050606.0, + "step": 5378 + }, + { + "epoch": 0.9988857938718663, + "grad_norm": 1.6138010025024414, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8742923736572266, + "num_tokens": 196082213.0, + "step": 5379 + }, + { + "epoch": 0.9990714948932219, + "grad_norm": 1.5520721673965454, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8761347532272339, + "num_tokens": 196116413.0, + "step": 5380 + }, + { + "epoch": 0.9992571959145775, + "grad_norm": 1.4301304817199707, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8742899894714355, + "num_tokens": 196153818.0, + "step": 5381 + }, + { + "epoch": 0.9994428969359331, + "grad_norm": 1.5704745054244995, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8597815632820129, + "num_tokens": 196191168.0, + "step": 5382 + }, + { + "epoch": 0.9996285979572888, + "grad_norm": 1.5721007585525513, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8722026348114014, + "num_tokens": 196223867.0, + "step": 5383 + }, + { + "epoch": 0.9998142989786444, + "grad_norm": 1.4200420379638672, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.86830735206604, + "num_tokens": 196261518.0, + "step": 5384 + }, + { + "epoch": 1.0, + "grad_norm": 1.4908266067504883, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8723427057266235, + "num_tokens": 196298307.0, + "step": 5385 + }, + { + "epoch": 1.0001857010213555, + "grad_norm": 1.4033629894256592, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8748028874397278, + "num_tokens": 196337156.0, + "step": 5386 + }, + { + "epoch": 1.0003714020427112, + "grad_norm": 1.5467170476913452, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.865146279335022, + "num_tokens": 196374238.0, + "step": 5387 + }, + { + "epoch": 1.0005571030640668, + "grad_norm": 1.3985652923583984, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8900207281112671, + "num_tokens": 196410640.0, + "step": 5388 + }, + { + "epoch": 1.0007428040854225, + "grad_norm": 1.3960739374160767, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8817939162254333, + "num_tokens": 196451199.0, + "step": 5389 + }, + { + "epoch": 1.000928505106778, + "grad_norm": 1.4288092851638794, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8816707730293274, + "num_tokens": 196492236.0, + "step": 5390 + }, + { + "epoch": 1.0011142061281337, + "grad_norm": 1.4978746175765991, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8740010261535645, + "num_tokens": 196530501.0, + "step": 5391 + }, + { + "epoch": 1.0012999071494892, + "grad_norm": 1.5518879890441895, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8793008327484131, + "num_tokens": 196563491.0, + "step": 5392 + }, + { + "epoch": 1.001485608170845, + "grad_norm": 1.4691460132598877, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8633707761764526, + "num_tokens": 196604396.0, + "step": 5393 + }, + { + "epoch": 1.0016713091922005, + "grad_norm": 1.5216857194900513, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8911130428314209, + "num_tokens": 196638779.0, + "step": 5394 + }, + { + "epoch": 1.0018570102135562, + "grad_norm": 1.598626971244812, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8688427209854126, + "num_tokens": 196674615.0, + "step": 5395 + }, + { + "epoch": 1.0020427112349117, + "grad_norm": 1.824216365814209, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8556557893753052, + "num_tokens": 196708906.0, + "step": 5396 + }, + { + "epoch": 1.0022284122562675, + "grad_norm": 1.6250051259994507, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8653547763824463, + "num_tokens": 196746119.0, + "step": 5397 + }, + { + "epoch": 1.002414113277623, + "grad_norm": 1.709039330482483, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8679660558700562, + "num_tokens": 196782082.0, + "step": 5398 + }, + { + "epoch": 1.0025998142989787, + "grad_norm": 1.7299858331680298, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8691709637641907, + "num_tokens": 196813289.0, + "step": 5399 + }, + { + "epoch": 1.0027855153203342, + "grad_norm": 1.6392920017242432, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8739697933197021, + "num_tokens": 196848038.0, + "step": 5400 + }, + { + "epoch": 1.00297121634169, + "grad_norm": 1.6790897846221924, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8814373016357422, + "num_tokens": 196878251.0, + "step": 5401 + }, + { + "epoch": 1.0031569173630455, + "grad_norm": 1.6351114511489868, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8838207721710205, + "num_tokens": 196910009.0, + "step": 5402 + }, + { + "epoch": 1.0033426183844012, + "grad_norm": 1.6542829275131226, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8649251461029053, + "num_tokens": 196945013.0, + "step": 5403 + }, + { + "epoch": 1.0035283194057567, + "grad_norm": 1.5860456228256226, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8828861713409424, + "num_tokens": 196979376.0, + "step": 5404 + }, + { + "epoch": 1.0037140204271124, + "grad_norm": 1.5048274993896484, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8716117143630981, + "num_tokens": 197016174.0, + "step": 5405 + }, + { + "epoch": 1.003899721448468, + "grad_norm": 1.4817695617675781, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8797118067741394, + "num_tokens": 197055752.0, + "step": 5406 + }, + { + "epoch": 1.0040854224698237, + "grad_norm": 1.5942302942276, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8658763766288757, + "num_tokens": 197092031.0, + "step": 5407 + }, + { + "epoch": 1.0042711234911792, + "grad_norm": 1.4134153127670288, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8825522661209106, + "num_tokens": 197131041.0, + "step": 5408 + }, + { + "epoch": 1.004456824512535, + "grad_norm": 1.503305196762085, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8688639402389526, + "num_tokens": 197168021.0, + "step": 5409 + }, + { + "epoch": 1.0046425255338904, + "grad_norm": 1.6351279020309448, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8883623480796814, + "num_tokens": 197197813.0, + "step": 5410 + }, + { + "epoch": 1.004828226555246, + "grad_norm": 1.6077470779418945, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8909766674041748, + "num_tokens": 197229453.0, + "step": 5411 + }, + { + "epoch": 1.0050139275766017, + "grad_norm": 1.5292531251907349, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8717203140258789, + "num_tokens": 197271135.0, + "step": 5412 + }, + { + "epoch": 1.0051996285979572, + "grad_norm": 1.3304740190505981, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8814234733581543, + "num_tokens": 197315243.0, + "step": 5413 + }, + { + "epoch": 1.005385329619313, + "grad_norm": 1.5204461812973022, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8722547292709351, + "num_tokens": 197352353.0, + "step": 5414 + }, + { + "epoch": 1.0055710306406684, + "grad_norm": 1.6451672315597534, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.880897045135498, + "num_tokens": 197385319.0, + "step": 5415 + }, + { + "epoch": 1.0057567316620242, + "grad_norm": 1.5235592126846313, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8786107301712036, + "num_tokens": 197420900.0, + "step": 5416 + }, + { + "epoch": 1.0059424326833797, + "grad_norm": 1.6930656433105469, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8822728395462036, + "num_tokens": 197451790.0, + "step": 5417 + }, + { + "epoch": 1.0061281337047354, + "grad_norm": 1.6422680616378784, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8781169652938843, + "num_tokens": 197487781.0, + "step": 5418 + }, + { + "epoch": 1.006313834726091, + "grad_norm": 1.6482471227645874, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8749879598617554, + "num_tokens": 197519603.0, + "step": 5419 + }, + { + "epoch": 1.0064995357474467, + "grad_norm": 1.553921103477478, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8863246440887451, + "num_tokens": 197553784.0, + "step": 5420 + }, + { + "epoch": 1.0066852367688022, + "grad_norm": 1.5338397026062012, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8766499161720276, + "num_tokens": 197590983.0, + "step": 5421 + }, + { + "epoch": 1.006870937790158, + "grad_norm": 1.6249216794967651, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8711625337600708, + "num_tokens": 197627469.0, + "step": 5422 + }, + { + "epoch": 1.0070566388115134, + "grad_norm": 1.564745545387268, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8644610643386841, + "num_tokens": 197664378.0, + "step": 5423 + }, + { + "epoch": 1.0072423398328691, + "grad_norm": 1.7080349922180176, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.88237464427948, + "num_tokens": 197698367.0, + "step": 5424 + }, + { + "epoch": 1.0074280408542247, + "grad_norm": 1.5146945714950562, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8774205446243286, + "num_tokens": 197735140.0, + "step": 5425 + }, + { + "epoch": 1.0076137418755804, + "grad_norm": 1.6183384656906128, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8872534036636353, + "num_tokens": 197770936.0, + "step": 5426 + }, + { + "epoch": 1.007799442896936, + "grad_norm": 1.6120209693908691, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8621758222579956, + "num_tokens": 197806952.0, + "step": 5427 + }, + { + "epoch": 1.0079851439182916, + "grad_norm": 1.5615164041519165, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8849554061889648, + "num_tokens": 197841969.0, + "step": 5428 + }, + { + "epoch": 1.0081708449396471, + "grad_norm": 1.6664632558822632, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8763980269432068, + "num_tokens": 197873163.0, + "step": 5429 + }, + { + "epoch": 1.0083565459610029, + "grad_norm": 1.4937547445297241, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8764315247535706, + "num_tokens": 197912127.0, + "step": 5430 + }, + { + "epoch": 1.0085422469823584, + "grad_norm": 1.6652940511703491, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8697441816329956, + "num_tokens": 197948183.0, + "step": 5431 + }, + { + "epoch": 1.0087279480037141, + "grad_norm": 1.5807642936706543, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.87961745262146, + "num_tokens": 197981985.0, + "step": 5432 + }, + { + "epoch": 1.0089136490250696, + "grad_norm": 1.3894020318984985, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8753955364227295, + "num_tokens": 198025902.0, + "step": 5433 + }, + { + "epoch": 1.0090993500464251, + "grad_norm": 1.6883697509765625, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8738245964050293, + "num_tokens": 198056825.0, + "step": 5434 + }, + { + "epoch": 1.0092850510677809, + "grad_norm": 1.588016390800476, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8711239695549011, + "num_tokens": 198097999.0, + "step": 5435 + }, + { + "epoch": 1.0094707520891364, + "grad_norm": 1.5714631080627441, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8794784545898438, + "num_tokens": 198133474.0, + "step": 5436 + }, + { + "epoch": 1.0096564531104921, + "grad_norm": 1.4573873281478882, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8761337995529175, + "num_tokens": 198175032.0, + "step": 5437 + }, + { + "epoch": 1.0098421541318476, + "grad_norm": 1.4720455408096313, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.881407618522644, + "num_tokens": 198214124.0, + "step": 5438 + }, + { + "epoch": 1.0100278551532034, + "grad_norm": 1.6158912181854248, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8731378316879272, + "num_tokens": 198248512.0, + "step": 5439 + }, + { + "epoch": 1.0102135561745589, + "grad_norm": 1.6633621454238892, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8770405054092407, + "num_tokens": 198279262.0, + "step": 5440 + }, + { + "epoch": 1.0103992571959146, + "grad_norm": 1.6379832029342651, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8711475729942322, + "num_tokens": 198316317.0, + "step": 5441 + }, + { + "epoch": 1.0105849582172701, + "grad_norm": 1.5464221239089966, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8549215793609619, + "num_tokens": 198356888.0, + "step": 5442 + }, + { + "epoch": 1.0107706592386259, + "grad_norm": 1.5287768840789795, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8750426769256592, + "num_tokens": 198396970.0, + "step": 5443 + }, + { + "epoch": 1.0109563602599814, + "grad_norm": 1.6782050132751465, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8652759194374084, + "num_tokens": 198429936.0, + "step": 5444 + }, + { + "epoch": 1.011142061281337, + "grad_norm": 1.6542584896087646, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8730199933052063, + "num_tokens": 198461037.0, + "step": 5445 + }, + { + "epoch": 1.0113277623026926, + "grad_norm": 1.5324307680130005, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.880220890045166, + "num_tokens": 198499506.0, + "step": 5446 + }, + { + "epoch": 1.0115134633240483, + "grad_norm": 1.7627482414245605, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.858396053314209, + "num_tokens": 198533356.0, + "step": 5447 + }, + { + "epoch": 1.0116991643454039, + "grad_norm": 1.5349812507629395, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8799431324005127, + "num_tokens": 198573373.0, + "step": 5448 + }, + { + "epoch": 1.0118848653667596, + "grad_norm": 1.6925877332687378, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8711333870887756, + "num_tokens": 198605850.0, + "step": 5449 + }, + { + "epoch": 1.012070566388115, + "grad_norm": 1.4396106004714966, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8762919902801514, + "num_tokens": 198650454.0, + "step": 5450 + }, + { + "epoch": 1.0122562674094708, + "grad_norm": 1.669033408164978, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8751698732376099, + "num_tokens": 198683450.0, + "step": 5451 + }, + { + "epoch": 1.0124419684308263, + "grad_norm": 1.5471782684326172, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8878603577613831, + "num_tokens": 198721702.0, + "step": 5452 + }, + { + "epoch": 1.012627669452182, + "grad_norm": 1.625732421875, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8674648404121399, + "num_tokens": 198760629.0, + "step": 5453 + }, + { + "epoch": 1.0128133704735376, + "grad_norm": 1.4404715299606323, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.882453441619873, + "num_tokens": 198800957.0, + "step": 5454 + }, + { + "epoch": 1.0129990714948933, + "grad_norm": 1.6959413290023804, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8680469989776611, + "num_tokens": 198834832.0, + "step": 5455 + }, + { + "epoch": 1.0131847725162488, + "grad_norm": 1.645464539527893, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8503454923629761, + "num_tokens": 198874304.0, + "step": 5456 + }, + { + "epoch": 1.0133704735376046, + "grad_norm": 1.717379093170166, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8692775368690491, + "num_tokens": 198908092.0, + "step": 5457 + }, + { + "epoch": 1.01355617455896, + "grad_norm": 1.6342970132827759, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8786877393722534, + "num_tokens": 198945235.0, + "step": 5458 + }, + { + "epoch": 1.0137418755803156, + "grad_norm": 1.689505934715271, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8803852796554565, + "num_tokens": 198981582.0, + "step": 5459 + }, + { + "epoch": 1.0139275766016713, + "grad_norm": 1.5698573589324951, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8652070760726929, + "num_tokens": 199019056.0, + "step": 5460 + }, + { + "epoch": 1.0141132776230268, + "grad_norm": 1.5149818658828735, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8850023746490479, + "num_tokens": 199058003.0, + "step": 5461 + }, + { + "epoch": 1.0142989786443826, + "grad_norm": 1.6488990783691406, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8734903931617737, + "num_tokens": 199089395.0, + "step": 5462 + }, + { + "epoch": 1.014484679665738, + "grad_norm": 1.6316490173339844, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8822815418243408, + "num_tokens": 199122453.0, + "step": 5463 + }, + { + "epoch": 1.0146703806870938, + "grad_norm": 1.6250020265579224, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8693081736564636, + "num_tokens": 199155809.0, + "step": 5464 + }, + { + "epoch": 1.0148560817084493, + "grad_norm": 1.435367465019226, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8800917863845825, + "num_tokens": 199194086.0, + "step": 5465 + }, + { + "epoch": 1.015041782729805, + "grad_norm": 1.4664937257766724, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8676846027374268, + "num_tokens": 199234643.0, + "step": 5466 + }, + { + "epoch": 1.0152274837511606, + "grad_norm": 1.7209997177124023, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8725223541259766, + "num_tokens": 199268666.0, + "step": 5467 + }, + { + "epoch": 1.0154131847725163, + "grad_norm": 1.7068023681640625, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8575049638748169, + "num_tokens": 199302740.0, + "step": 5468 + }, + { + "epoch": 1.0155988857938718, + "grad_norm": 1.7585344314575195, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8700989484786987, + "num_tokens": 199331877.0, + "step": 5469 + }, + { + "epoch": 1.0157845868152275, + "grad_norm": 1.5386053323745728, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8700380921363831, + "num_tokens": 199369714.0, + "step": 5470 + }, + { + "epoch": 1.015970287836583, + "grad_norm": 1.5557535886764526, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8737381100654602, + "num_tokens": 199404098.0, + "step": 5471 + }, + { + "epoch": 1.0161559888579388, + "grad_norm": 1.5659340620040894, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8707255125045776, + "num_tokens": 199440559.0, + "step": 5472 + }, + { + "epoch": 1.0163416898792943, + "grad_norm": 1.6749900579452515, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8777360320091248, + "num_tokens": 199472672.0, + "step": 5473 + }, + { + "epoch": 1.01652739090065, + "grad_norm": 1.555342197418213, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8685237765312195, + "num_tokens": 199512036.0, + "step": 5474 + }, + { + "epoch": 1.0167130919220055, + "grad_norm": 1.4871373176574707, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8757723569869995, + "num_tokens": 199550616.0, + "step": 5475 + }, + { + "epoch": 1.0168987929433613, + "grad_norm": 1.5550856590270996, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.89177405834198, + "num_tokens": 199582384.0, + "step": 5476 + }, + { + "epoch": 1.0170844939647168, + "grad_norm": 1.4832258224487305, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8730424642562866, + "num_tokens": 199621785.0, + "step": 5477 + }, + { + "epoch": 1.0172701949860725, + "grad_norm": 1.564344882965088, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8728686571121216, + "num_tokens": 199660716.0, + "step": 5478 + }, + { + "epoch": 1.017455896007428, + "grad_norm": 1.675917387008667, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8633363246917725, + "num_tokens": 199695402.0, + "step": 5479 + }, + { + "epoch": 1.0176415970287838, + "grad_norm": 1.4731794595718384, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8698341846466064, + "num_tokens": 199735462.0, + "step": 5480 + }, + { + "epoch": 1.0178272980501393, + "grad_norm": 1.4868006706237793, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8789166212081909, + "num_tokens": 199778679.0, + "step": 5481 + }, + { + "epoch": 1.0180129990714948, + "grad_norm": 1.4906806945800781, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8701388835906982, + "num_tokens": 199816876.0, + "step": 5482 + }, + { + "epoch": 1.0181987000928505, + "grad_norm": 1.5486584901809692, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8769386410713196, + "num_tokens": 199855724.0, + "step": 5483 + }, + { + "epoch": 1.018384401114206, + "grad_norm": 1.4335976839065552, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8759081959724426, + "num_tokens": 199895269.0, + "step": 5484 + }, + { + "epoch": 1.0185701021355618, + "grad_norm": 1.5872925519943237, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8744983673095703, + "num_tokens": 199931854.0, + "step": 5485 + }, + { + "epoch": 1.0187558031569173, + "grad_norm": 1.647838830947876, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.876351535320282, + "num_tokens": 199963622.0, + "step": 5486 + }, + { + "epoch": 1.018941504178273, + "grad_norm": 1.4866944551467896, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8816514015197754, + "num_tokens": 200000883.0, + "step": 5487 + }, + { + "epoch": 1.0191272051996285, + "grad_norm": 1.6724438667297363, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8780307769775391, + "num_tokens": 200034432.0, + "step": 5488 + }, + { + "epoch": 1.0193129062209842, + "grad_norm": 1.5297211408615112, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8947386741638184, + "num_tokens": 200067624.0, + "step": 5489 + }, + { + "epoch": 1.0194986072423398, + "grad_norm": 1.6023874282836914, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8670420050621033, + "num_tokens": 200101227.0, + "step": 5490 + }, + { + "epoch": 1.0196843082636955, + "grad_norm": 1.5108363628387451, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.875004768371582, + "num_tokens": 200140324.0, + "step": 5491 + }, + { + "epoch": 1.019870009285051, + "grad_norm": 1.448194980621338, + "learning_rate": 1e-06, + "loss": 0.277, + "mean_token_accuracy": 0.8997840881347656, + "num_tokens": 200174925.0, + "step": 5492 + }, + { + "epoch": 1.0200557103064067, + "grad_norm": 1.7052987813949585, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8756272792816162, + "num_tokens": 200209447.0, + "step": 5493 + }, + { + "epoch": 1.0202414113277622, + "grad_norm": 1.5632734298706055, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8736393451690674, + "num_tokens": 200245237.0, + "step": 5494 + }, + { + "epoch": 1.020427112349118, + "grad_norm": 1.4290804862976074, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8790543675422668, + "num_tokens": 200285292.0, + "step": 5495 + }, + { + "epoch": 1.0206128133704735, + "grad_norm": 1.6032582521438599, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8763266205787659, + "num_tokens": 200323359.0, + "step": 5496 + }, + { + "epoch": 1.0207985143918292, + "grad_norm": 1.5919909477233887, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8666015267372131, + "num_tokens": 200360764.0, + "step": 5497 + }, + { + "epoch": 1.0209842154131847, + "grad_norm": 1.8039567470550537, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8718441724777222, + "num_tokens": 200391810.0, + "step": 5498 + }, + { + "epoch": 1.0211699164345405, + "grad_norm": 1.4895262718200684, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8836655616760254, + "num_tokens": 200427120.0, + "step": 5499 + }, + { + "epoch": 1.021355617455896, + "grad_norm": 1.5594183206558228, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8786433935165405, + "num_tokens": 200460948.0, + "step": 5500 + }, + { + "epoch": 1.0215413184772517, + "grad_norm": 1.562830924987793, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8656043410301208, + "num_tokens": 200497694.0, + "step": 5501 + }, + { + "epoch": 1.0217270194986072, + "grad_norm": 1.5494333505630493, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8866506814956665, + "num_tokens": 200532657.0, + "step": 5502 + }, + { + "epoch": 1.021912720519963, + "grad_norm": 1.4760318994522095, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8677347898483276, + "num_tokens": 200572021.0, + "step": 5503 + }, + { + "epoch": 1.0220984215413185, + "grad_norm": 1.4814246892929077, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8730926513671875, + "num_tokens": 200611345.0, + "step": 5504 + }, + { + "epoch": 1.0222841225626742, + "grad_norm": 1.4835513830184937, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.875774621963501, + "num_tokens": 200651788.0, + "step": 5505 + }, + { + "epoch": 1.0224698235840297, + "grad_norm": 1.4181139469146729, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8799443244934082, + "num_tokens": 200693062.0, + "step": 5506 + }, + { + "epoch": 1.0226555246053852, + "grad_norm": 1.4851287603378296, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8709933161735535, + "num_tokens": 200734473.0, + "step": 5507 + }, + { + "epoch": 1.022841225626741, + "grad_norm": 1.569696068763733, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.876798152923584, + "num_tokens": 200769147.0, + "step": 5508 + }, + { + "epoch": 1.0230269266480965, + "grad_norm": 1.7750126123428345, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8593199849128723, + "num_tokens": 200799945.0, + "step": 5509 + }, + { + "epoch": 1.0232126276694522, + "grad_norm": 1.6196290254592896, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8803509473800659, + "num_tokens": 200835457.0, + "step": 5510 + }, + { + "epoch": 1.0233983286908077, + "grad_norm": 1.5285922288894653, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8679619431495667, + "num_tokens": 200873669.0, + "step": 5511 + }, + { + "epoch": 1.0235840297121634, + "grad_norm": 1.5799959897994995, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8840487003326416, + "num_tokens": 200907192.0, + "step": 5512 + }, + { + "epoch": 1.023769730733519, + "grad_norm": 1.56220281124115, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8781400322914124, + "num_tokens": 200944775.0, + "step": 5513 + }, + { + "epoch": 1.0239554317548747, + "grad_norm": 1.4912279844284058, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8692019581794739, + "num_tokens": 200982341.0, + "step": 5514 + }, + { + "epoch": 1.0241411327762302, + "grad_norm": 1.5978858470916748, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8799020648002625, + "num_tokens": 201015256.0, + "step": 5515 + }, + { + "epoch": 1.024326833797586, + "grad_norm": 1.672987461090088, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8646519184112549, + "num_tokens": 201052243.0, + "step": 5516 + }, + { + "epoch": 1.0245125348189414, + "grad_norm": 1.5744593143463135, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8749566674232483, + "num_tokens": 201085421.0, + "step": 5517 + }, + { + "epoch": 1.0246982358402972, + "grad_norm": 1.7245385646820068, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.867435097694397, + "num_tokens": 201116742.0, + "step": 5518 + }, + { + "epoch": 1.0248839368616527, + "grad_norm": 1.4638954401016235, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8694018721580505, + "num_tokens": 201157335.0, + "step": 5519 + }, + { + "epoch": 1.0250696378830084, + "grad_norm": 1.567474365234375, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8723818063735962, + "num_tokens": 201195834.0, + "step": 5520 + }, + { + "epoch": 1.025255338904364, + "grad_norm": 1.4612951278686523, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8704270720481873, + "num_tokens": 201238958.0, + "step": 5521 + }, + { + "epoch": 1.0254410399257197, + "grad_norm": 1.5500167608261108, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.880930483341217, + "num_tokens": 201272334.0, + "step": 5522 + }, + { + "epoch": 1.0256267409470752, + "grad_norm": 1.5898877382278442, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8805224299430847, + "num_tokens": 201305719.0, + "step": 5523 + }, + { + "epoch": 1.025812441968431, + "grad_norm": 1.617984414100647, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8674522638320923, + "num_tokens": 201343569.0, + "step": 5524 + }, + { + "epoch": 1.0259981429897864, + "grad_norm": 1.4741694927215576, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8890587687492371, + "num_tokens": 201380414.0, + "step": 5525 + }, + { + "epoch": 1.0261838440111422, + "grad_norm": 1.597312331199646, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8725553750991821, + "num_tokens": 201415348.0, + "step": 5526 + }, + { + "epoch": 1.0263695450324977, + "grad_norm": 1.5756019353866577, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.87644362449646, + "num_tokens": 201450764.0, + "step": 5527 + }, + { + "epoch": 1.0265552460538534, + "grad_norm": 1.4394705295562744, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8746334314346313, + "num_tokens": 201491483.0, + "step": 5528 + }, + { + "epoch": 1.026740947075209, + "grad_norm": 1.640574336051941, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.865531861782074, + "num_tokens": 201524408.0, + "step": 5529 + }, + { + "epoch": 1.0269266480965644, + "grad_norm": 1.5031793117523193, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8734728097915649, + "num_tokens": 201565959.0, + "step": 5530 + }, + { + "epoch": 1.0271123491179202, + "grad_norm": 1.5033456087112427, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8870309591293335, + "num_tokens": 201599862.0, + "step": 5531 + }, + { + "epoch": 1.0272980501392757, + "grad_norm": 1.5478647947311401, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8787517547607422, + "num_tokens": 201635495.0, + "step": 5532 + }, + { + "epoch": 1.0274837511606314, + "grad_norm": 1.4889899492263794, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8869891166687012, + "num_tokens": 201674055.0, + "step": 5533 + }, + { + "epoch": 1.027669452181987, + "grad_norm": 1.68934965133667, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8710644841194153, + "num_tokens": 201705315.0, + "step": 5534 + }, + { + "epoch": 1.0278551532033426, + "grad_norm": 1.478531002998352, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8715769052505493, + "num_tokens": 201742320.0, + "step": 5535 + }, + { + "epoch": 1.0280408542246982, + "grad_norm": 1.4456578493118286, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8784628510475159, + "num_tokens": 201782373.0, + "step": 5536 + }, + { + "epoch": 1.0282265552460539, + "grad_norm": 1.5121291875839233, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8832049369812012, + "num_tokens": 201818632.0, + "step": 5537 + }, + { + "epoch": 1.0284122562674094, + "grad_norm": 1.6865997314453125, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8738936185836792, + "num_tokens": 201849467.0, + "step": 5538 + }, + { + "epoch": 1.0285979572887651, + "grad_norm": 1.6204572916030884, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8754256963729858, + "num_tokens": 201882376.0, + "step": 5539 + }, + { + "epoch": 1.0287836583101206, + "grad_norm": 1.5166324377059937, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8864947557449341, + "num_tokens": 201915011.0, + "step": 5540 + }, + { + "epoch": 1.0289693593314764, + "grad_norm": 1.3788601160049438, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8925231695175171, + "num_tokens": 201954967.0, + "step": 5541 + }, + { + "epoch": 1.0291550603528319, + "grad_norm": 1.3784598112106323, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8784053325653076, + "num_tokens": 201999321.0, + "step": 5542 + }, + { + "epoch": 1.0293407613741876, + "grad_norm": 1.52992582321167, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8835126161575317, + "num_tokens": 202033889.0, + "step": 5543 + }, + { + "epoch": 1.0295264623955431, + "grad_norm": 1.4772616624832153, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8779825568199158, + "num_tokens": 202071564.0, + "step": 5544 + }, + { + "epoch": 1.0297121634168989, + "grad_norm": 1.650695562362671, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8708133101463318, + "num_tokens": 202105293.0, + "step": 5545 + }, + { + "epoch": 1.0298978644382544, + "grad_norm": 1.4297547340393066, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8713597059249878, + "num_tokens": 202145474.0, + "step": 5546 + }, + { + "epoch": 1.03008356545961, + "grad_norm": 1.539346694946289, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.878845751285553, + "num_tokens": 202186735.0, + "step": 5547 + }, + { + "epoch": 1.0302692664809656, + "grad_norm": 1.509996771812439, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8700673580169678, + "num_tokens": 202225813.0, + "step": 5548 + }, + { + "epoch": 1.0304549675023214, + "grad_norm": 1.3806698322296143, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8765884041786194, + "num_tokens": 202271203.0, + "step": 5549 + }, + { + "epoch": 1.0306406685236769, + "grad_norm": 1.4541516304016113, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8798673152923584, + "num_tokens": 202310040.0, + "step": 5550 + }, + { + "epoch": 1.0308263695450326, + "grad_norm": 1.5940930843353271, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8763355016708374, + "num_tokens": 202348356.0, + "step": 5551 + }, + { + "epoch": 1.031012070566388, + "grad_norm": 1.521729826927185, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8745309114456177, + "num_tokens": 202388189.0, + "step": 5552 + }, + { + "epoch": 1.0311977715877438, + "grad_norm": 1.557898998260498, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8878733515739441, + "num_tokens": 202423684.0, + "step": 5553 + }, + { + "epoch": 1.0313834726090993, + "grad_norm": 1.56083083152771, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8790033459663391, + "num_tokens": 202459173.0, + "step": 5554 + }, + { + "epoch": 1.0315691736304549, + "grad_norm": 1.589247703552246, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8595036268234253, + "num_tokens": 202498845.0, + "step": 5555 + }, + { + "epoch": 1.0317548746518106, + "grad_norm": 1.6377012729644775, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8799039125442505, + "num_tokens": 202533097.0, + "step": 5556 + }, + { + "epoch": 1.031940575673166, + "grad_norm": 1.6298365592956543, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8642655611038208, + "num_tokens": 202566372.0, + "step": 5557 + }, + { + "epoch": 1.0321262766945218, + "grad_norm": 1.6902755498886108, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8660824298858643, + "num_tokens": 202599053.0, + "step": 5558 + }, + { + "epoch": 1.0323119777158773, + "grad_norm": 1.584337592124939, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8757559061050415, + "num_tokens": 202634623.0, + "step": 5559 + }, + { + "epoch": 1.032497678737233, + "grad_norm": 1.6730406284332275, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8815574049949646, + "num_tokens": 202667850.0, + "step": 5560 + }, + { + "epoch": 1.0326833797585886, + "grad_norm": 1.6356607675552368, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8729766011238098, + "num_tokens": 202703440.0, + "step": 5561 + }, + { + "epoch": 1.0328690807799443, + "grad_norm": 1.608322024345398, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8751238584518433, + "num_tokens": 202737963.0, + "step": 5562 + }, + { + "epoch": 1.0330547818012998, + "grad_norm": 1.578709602355957, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8688901662826538, + "num_tokens": 202772570.0, + "step": 5563 + }, + { + "epoch": 1.0332404828226556, + "grad_norm": 1.5220839977264404, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8681034445762634, + "num_tokens": 202812006.0, + "step": 5564 + }, + { + "epoch": 1.033426183844011, + "grad_norm": 1.4859044551849365, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8818404078483582, + "num_tokens": 202847843.0, + "step": 5565 + }, + { + "epoch": 1.0336118848653668, + "grad_norm": 1.531447410583496, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8771510124206543, + "num_tokens": 202884336.0, + "step": 5566 + }, + { + "epoch": 1.0337975858867223, + "grad_norm": 1.6387662887573242, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8569583892822266, + "num_tokens": 202918823.0, + "step": 5567 + }, + { + "epoch": 1.033983286908078, + "grad_norm": 1.7146668434143066, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.870052695274353, + "num_tokens": 202949346.0, + "step": 5568 + }, + { + "epoch": 1.0341689879294336, + "grad_norm": 1.4763870239257812, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8863904476165771, + "num_tokens": 202987768.0, + "step": 5569 + }, + { + "epoch": 1.0343546889507893, + "grad_norm": 1.6325998306274414, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8778313398361206, + "num_tokens": 203018932.0, + "step": 5570 + }, + { + "epoch": 1.0345403899721448, + "grad_norm": 1.5427193641662598, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8839477300643921, + "num_tokens": 203053849.0, + "step": 5571 + }, + { + "epoch": 1.0347260909935005, + "grad_norm": 1.6733636856079102, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8574261665344238, + "num_tokens": 203088714.0, + "step": 5572 + }, + { + "epoch": 1.034911792014856, + "grad_norm": 1.6218769550323486, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8860554695129395, + "num_tokens": 203120678.0, + "step": 5573 + }, + { + "epoch": 1.0350974930362118, + "grad_norm": 1.5339694023132324, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8678529858589172, + "num_tokens": 203157525.0, + "step": 5574 + }, + { + "epoch": 1.0352831940575673, + "grad_norm": 1.4887183904647827, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8831335306167603, + "num_tokens": 203196260.0, + "step": 5575 + }, + { + "epoch": 1.035468895078923, + "grad_norm": 1.498948097229004, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.873479962348938, + "num_tokens": 203237040.0, + "step": 5576 + }, + { + "epoch": 1.0356545961002785, + "grad_norm": 1.5949034690856934, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8843857645988464, + "num_tokens": 203271608.0, + "step": 5577 + }, + { + "epoch": 1.0358402971216343, + "grad_norm": 1.6104077100753784, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8828490376472473, + "num_tokens": 203303513.0, + "step": 5578 + }, + { + "epoch": 1.0360259981429898, + "grad_norm": 1.6846321821212769, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8562009334564209, + "num_tokens": 203337539.0, + "step": 5579 + }, + { + "epoch": 1.0362116991643453, + "grad_norm": 1.5116742849349976, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8726973533630371, + "num_tokens": 203379257.0, + "step": 5580 + }, + { + "epoch": 1.036397400185701, + "grad_norm": 1.440557837486267, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8824203014373779, + "num_tokens": 203419179.0, + "step": 5581 + }, + { + "epoch": 1.0365831012070565, + "grad_norm": 1.7204623222351074, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8504787683486938, + "num_tokens": 203456759.0, + "step": 5582 + }, + { + "epoch": 1.0367688022284123, + "grad_norm": 1.586064338684082, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8782442212104797, + "num_tokens": 203490526.0, + "step": 5583 + }, + { + "epoch": 1.0369545032497678, + "grad_norm": 1.5245285034179688, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8799376487731934, + "num_tokens": 203529346.0, + "step": 5584 + }, + { + "epoch": 1.0371402042711235, + "grad_norm": 1.6207695007324219, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8725840449333191, + "num_tokens": 203565536.0, + "step": 5585 + }, + { + "epoch": 1.037325905292479, + "grad_norm": 1.4074676036834717, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8891281485557556, + "num_tokens": 203604294.0, + "step": 5586 + }, + { + "epoch": 1.0375116063138348, + "grad_norm": 1.519037127494812, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8789637684822083, + "num_tokens": 203640271.0, + "step": 5587 + }, + { + "epoch": 1.0376973073351903, + "grad_norm": 1.5232232809066772, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8842218518257141, + "num_tokens": 203676773.0, + "step": 5588 + }, + { + "epoch": 1.037883008356546, + "grad_norm": 1.5080853700637817, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8617750406265259, + "num_tokens": 203716827.0, + "step": 5589 + }, + { + "epoch": 1.0380687093779015, + "grad_norm": 1.5078785419464111, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.878007173538208, + "num_tokens": 203755594.0, + "step": 5590 + }, + { + "epoch": 1.0382544103992573, + "grad_norm": 1.3877791166305542, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8799632787704468, + "num_tokens": 203795451.0, + "step": 5591 + }, + { + "epoch": 1.0384401114206128, + "grad_norm": 1.5402965545654297, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8907097578048706, + "num_tokens": 203832901.0, + "step": 5592 + }, + { + "epoch": 1.0386258124419685, + "grad_norm": 1.5407851934432983, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8810142278671265, + "num_tokens": 203866990.0, + "step": 5593 + }, + { + "epoch": 1.038811513463324, + "grad_norm": 1.7782784700393677, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8650360107421875, + "num_tokens": 203902535.0, + "step": 5594 + }, + { + "epoch": 1.0389972144846797, + "grad_norm": 1.56001615524292, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8769972324371338, + "num_tokens": 203941568.0, + "step": 5595 + }, + { + "epoch": 1.0391829155060353, + "grad_norm": 1.582351803779602, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8805595636367798, + "num_tokens": 203976898.0, + "step": 5596 + }, + { + "epoch": 1.039368616527391, + "grad_norm": 1.6076552867889404, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8801140189170837, + "num_tokens": 204008164.0, + "step": 5597 + }, + { + "epoch": 1.0395543175487465, + "grad_norm": 1.7915587425231934, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8834850192070007, + "num_tokens": 204038070.0, + "step": 5598 + }, + { + "epoch": 1.0397400185701022, + "grad_norm": 1.448306918144226, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8850709199905396, + "num_tokens": 204074718.0, + "step": 5599 + }, + { + "epoch": 1.0399257195914577, + "grad_norm": 1.4459595680236816, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8890097141265869, + "num_tokens": 204114761.0, + "step": 5600 + }, + { + "epoch": 1.0401114206128135, + "grad_norm": 1.3426827192306519, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8901189565658569, + "num_tokens": 204158564.0, + "step": 5601 + }, + { + "epoch": 1.040297121634169, + "grad_norm": 1.5458884239196777, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8654649257659912, + "num_tokens": 204197473.0, + "step": 5602 + }, + { + "epoch": 1.0404828226555245, + "grad_norm": 1.6884727478027344, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8815531134605408, + "num_tokens": 204226386.0, + "step": 5603 + }, + { + "epoch": 1.0406685236768802, + "grad_norm": 1.4489377737045288, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8765147924423218, + "num_tokens": 204267754.0, + "step": 5604 + }, + { + "epoch": 1.0408542246982357, + "grad_norm": 1.5607173442840576, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8742026090621948, + "num_tokens": 204304813.0, + "step": 5605 + }, + { + "epoch": 1.0410399257195915, + "grad_norm": 1.6334036588668823, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8771699070930481, + "num_tokens": 204335707.0, + "step": 5606 + }, + { + "epoch": 1.041225626740947, + "grad_norm": 1.7961074113845825, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8753914833068848, + "num_tokens": 204367503.0, + "step": 5607 + }, + { + "epoch": 1.0414113277623027, + "grad_norm": 1.6225826740264893, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8834632039070129, + "num_tokens": 204403798.0, + "step": 5608 + }, + { + "epoch": 1.0415970287836582, + "grad_norm": 1.467577576637268, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8874947428703308, + "num_tokens": 204440304.0, + "step": 5609 + }, + { + "epoch": 1.041782729805014, + "grad_norm": 1.4170880317687988, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8909432888031006, + "num_tokens": 204479765.0, + "step": 5610 + }, + { + "epoch": 1.0419684308263695, + "grad_norm": 1.5706377029418945, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8793931007385254, + "num_tokens": 204513934.0, + "step": 5611 + }, + { + "epoch": 1.0421541318477252, + "grad_norm": 1.7932366132736206, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8699688911437988, + "num_tokens": 204550107.0, + "step": 5612 + }, + { + "epoch": 1.0423398328690807, + "grad_norm": 1.6176811456680298, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8574268221855164, + "num_tokens": 204587149.0, + "step": 5613 + }, + { + "epoch": 1.0425255338904365, + "grad_norm": 1.4870513677597046, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8598990440368652, + "num_tokens": 204628769.0, + "step": 5614 + }, + { + "epoch": 1.042711234911792, + "grad_norm": 1.6213150024414062, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8710923790931702, + "num_tokens": 204667402.0, + "step": 5615 + }, + { + "epoch": 1.0428969359331477, + "grad_norm": 1.4939899444580078, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8770792484283447, + "num_tokens": 204704261.0, + "step": 5616 + }, + { + "epoch": 1.0430826369545032, + "grad_norm": 1.6810702085494995, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8756392002105713, + "num_tokens": 204735205.0, + "step": 5617 + }, + { + "epoch": 1.043268337975859, + "grad_norm": 1.7319557666778564, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8720836639404297, + "num_tokens": 204765893.0, + "step": 5618 + }, + { + "epoch": 1.0434540389972145, + "grad_norm": 1.713753342628479, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8617716431617737, + "num_tokens": 204801941.0, + "step": 5619 + }, + { + "epoch": 1.0436397400185702, + "grad_norm": 1.5907357931137085, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.885390043258667, + "num_tokens": 204836504.0, + "step": 5620 + }, + { + "epoch": 1.0438254410399257, + "grad_norm": 1.655258297920227, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8645554780960083, + "num_tokens": 204870550.0, + "step": 5621 + }, + { + "epoch": 1.0440111420612814, + "grad_norm": 1.6530145406723022, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8684589862823486, + "num_tokens": 204904004.0, + "step": 5622 + }, + { + "epoch": 1.044196843082637, + "grad_norm": 1.6232457160949707, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8648261427879333, + "num_tokens": 204944346.0, + "step": 5623 + }, + { + "epoch": 1.0443825441039927, + "grad_norm": 1.6452184915542603, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.872975766658783, + "num_tokens": 204976838.0, + "step": 5624 + }, + { + "epoch": 1.0445682451253482, + "grad_norm": 1.5319483280181885, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8691655397415161, + "num_tokens": 205016892.0, + "step": 5625 + }, + { + "epoch": 1.0447539461467037, + "grad_norm": 1.535565972328186, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8639503121376038, + "num_tokens": 205055668.0, + "step": 5626 + }, + { + "epoch": 1.0449396471680594, + "grad_norm": 1.5820693969726562, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.87535160779953, + "num_tokens": 205093518.0, + "step": 5627 + }, + { + "epoch": 1.045125348189415, + "grad_norm": 1.424308180809021, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8887421488761902, + "num_tokens": 205132178.0, + "step": 5628 + }, + { + "epoch": 1.0453110492107707, + "grad_norm": 1.6023755073547363, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8787136077880859, + "num_tokens": 205165491.0, + "step": 5629 + }, + { + "epoch": 1.0454967502321262, + "grad_norm": 1.5321639776229858, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8742173910140991, + "num_tokens": 205200731.0, + "step": 5630 + }, + { + "epoch": 1.045682451253482, + "grad_norm": 1.5828700065612793, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8844122886657715, + "num_tokens": 205232061.0, + "step": 5631 + }, + { + "epoch": 1.0458681522748374, + "grad_norm": 1.4804494380950928, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8726544380187988, + "num_tokens": 205273233.0, + "step": 5632 + }, + { + "epoch": 1.0460538532961932, + "grad_norm": 1.770907998085022, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8580589294433594, + "num_tokens": 205313513.0, + "step": 5633 + }, + { + "epoch": 1.0462395543175487, + "grad_norm": 1.4294828176498413, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8894405364990234, + "num_tokens": 205352086.0, + "step": 5634 + }, + { + "epoch": 1.0464252553389044, + "grad_norm": 1.755181074142456, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8719263672828674, + "num_tokens": 205381298.0, + "step": 5635 + }, + { + "epoch": 1.04661095636026, + "grad_norm": 1.5023808479309082, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.866180956363678, + "num_tokens": 205419971.0, + "step": 5636 + }, + { + "epoch": 1.0467966573816156, + "grad_norm": 1.6896699666976929, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8760151863098145, + "num_tokens": 205453188.0, + "step": 5637 + }, + { + "epoch": 1.0469823584029712, + "grad_norm": 1.5857058763504028, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8797643780708313, + "num_tokens": 205486541.0, + "step": 5638 + }, + { + "epoch": 1.047168059424327, + "grad_norm": 1.5107383728027344, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8921717405319214, + "num_tokens": 205523271.0, + "step": 5639 + }, + { + "epoch": 1.0473537604456824, + "grad_norm": 1.7075363397598267, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8639082908630371, + "num_tokens": 205557252.0, + "step": 5640 + }, + { + "epoch": 1.0475394614670381, + "grad_norm": 1.7089359760284424, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8615906238555908, + "num_tokens": 205591088.0, + "step": 5641 + }, + { + "epoch": 1.0477251624883936, + "grad_norm": 1.4660327434539795, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8721817135810852, + "num_tokens": 205632394.0, + "step": 5642 + }, + { + "epoch": 1.0479108635097494, + "grad_norm": 1.4604277610778809, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8940337896347046, + "num_tokens": 205668600.0, + "step": 5643 + }, + { + "epoch": 1.048096564531105, + "grad_norm": 1.6209254264831543, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8703174591064453, + "num_tokens": 205702505.0, + "step": 5644 + }, + { + "epoch": 1.0482822655524606, + "grad_norm": 1.4713962078094482, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.882216215133667, + "num_tokens": 205741108.0, + "step": 5645 + }, + { + "epoch": 1.0484679665738161, + "grad_norm": 1.5219899415969849, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8790796995162964, + "num_tokens": 205776277.0, + "step": 5646 + }, + { + "epoch": 1.0486536675951719, + "grad_norm": 1.6149123907089233, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8494279384613037, + "num_tokens": 205814407.0, + "step": 5647 + }, + { + "epoch": 1.0488393686165274, + "grad_norm": 1.5500802993774414, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8739728927612305, + "num_tokens": 205852911.0, + "step": 5648 + }, + { + "epoch": 1.0490250696378831, + "grad_norm": 1.5523265600204468, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8778913617134094, + "num_tokens": 205889251.0, + "step": 5649 + }, + { + "epoch": 1.0492107706592386, + "grad_norm": 1.34617018699646, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8700776100158691, + "num_tokens": 205934274.0, + "step": 5650 + }, + { + "epoch": 1.0493964716805944, + "grad_norm": 1.6949818134307861, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.883873701095581, + "num_tokens": 205963827.0, + "step": 5651 + }, + { + "epoch": 1.0495821727019499, + "grad_norm": 1.5353528261184692, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8808996677398682, + "num_tokens": 205998833.0, + "step": 5652 + }, + { + "epoch": 1.0497678737233054, + "grad_norm": 1.4993001222610474, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.876213550567627, + "num_tokens": 206036765.0, + "step": 5653 + }, + { + "epoch": 1.0499535747446611, + "grad_norm": 1.4349877834320068, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8785929083824158, + "num_tokens": 206074559.0, + "step": 5654 + }, + { + "epoch": 1.0501392757660166, + "grad_norm": 1.7544676065444946, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8872708082199097, + "num_tokens": 206102869.0, + "step": 5655 + }, + { + "epoch": 1.0503249767873724, + "grad_norm": 1.5572543144226074, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8756657242774963, + "num_tokens": 206138401.0, + "step": 5656 + }, + { + "epoch": 1.0505106778087279, + "grad_norm": 1.4316602945327759, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8742678165435791, + "num_tokens": 206179622.0, + "step": 5657 + }, + { + "epoch": 1.0506963788300836, + "grad_norm": 1.4649381637573242, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.885895311832428, + "num_tokens": 206215579.0, + "step": 5658 + }, + { + "epoch": 1.050882079851439, + "grad_norm": 1.5198709964752197, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8696349263191223, + "num_tokens": 206256839.0, + "step": 5659 + }, + { + "epoch": 1.0510677808727948, + "grad_norm": 1.591052532196045, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.875030517578125, + "num_tokens": 206292261.0, + "step": 5660 + }, + { + "epoch": 1.0512534818941504, + "grad_norm": 1.578495740890503, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8876394033432007, + "num_tokens": 206325620.0, + "step": 5661 + }, + { + "epoch": 1.051439182915506, + "grad_norm": 1.5331984758377075, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8885070085525513, + "num_tokens": 206361817.0, + "step": 5662 + }, + { + "epoch": 1.0516248839368616, + "grad_norm": 1.7142620086669922, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8598783016204834, + "num_tokens": 206397402.0, + "step": 5663 + }, + { + "epoch": 1.0518105849582173, + "grad_norm": 1.6369708776474, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8714758157730103, + "num_tokens": 206429137.0, + "step": 5664 + }, + { + "epoch": 1.0519962859795728, + "grad_norm": 1.5321825742721558, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8755583763122559, + "num_tokens": 206467907.0, + "step": 5665 + }, + { + "epoch": 1.0521819870009286, + "grad_norm": 1.6977471113204956, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.888330340385437, + "num_tokens": 206496626.0, + "step": 5666 + }, + { + "epoch": 1.052367688022284, + "grad_norm": 1.599830150604248, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8753839731216431, + "num_tokens": 206532623.0, + "step": 5667 + }, + { + "epoch": 1.0525533890436398, + "grad_norm": 1.5297064781188965, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8706372976303101, + "num_tokens": 206568303.0, + "step": 5668 + }, + { + "epoch": 1.0527390900649953, + "grad_norm": 1.569823145866394, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.850102424621582, + "num_tokens": 206608551.0, + "step": 5669 + }, + { + "epoch": 1.052924791086351, + "grad_norm": 1.4732919931411743, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8845040798187256, + "num_tokens": 206646997.0, + "step": 5670 + }, + { + "epoch": 1.0531104921077066, + "grad_norm": 1.5086286067962646, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8800631761550903, + "num_tokens": 206683240.0, + "step": 5671 + }, + { + "epoch": 1.0532961931290623, + "grad_norm": 1.5909713506698608, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8619948625564575, + "num_tokens": 206719700.0, + "step": 5672 + }, + { + "epoch": 1.0534818941504178, + "grad_norm": 1.48130202293396, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8672200441360474, + "num_tokens": 206758525.0, + "step": 5673 + }, + { + "epoch": 1.0536675951717736, + "grad_norm": 1.8927767276763916, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.879619836807251, + "num_tokens": 206784127.0, + "step": 5674 + }, + { + "epoch": 1.053853296193129, + "grad_norm": 1.4192464351654053, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8925430774688721, + "num_tokens": 206821289.0, + "step": 5675 + }, + { + "epoch": 1.0540389972144846, + "grad_norm": 1.4537432193756104, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.86905837059021, + "num_tokens": 206861002.0, + "step": 5676 + }, + { + "epoch": 1.0542246982358403, + "grad_norm": 1.7400048971176147, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8783220052719116, + "num_tokens": 206892222.0, + "step": 5677 + }, + { + "epoch": 1.0544103992571958, + "grad_norm": 1.69443678855896, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8582124710083008, + "num_tokens": 206925459.0, + "step": 5678 + }, + { + "epoch": 1.0545961002785516, + "grad_norm": 1.686721920967102, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8692249059677124, + "num_tokens": 206958739.0, + "step": 5679 + }, + { + "epoch": 1.054781801299907, + "grad_norm": 1.5313032865524292, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.871677041053772, + "num_tokens": 207000201.0, + "step": 5680 + }, + { + "epoch": 1.0549675023212628, + "grad_norm": 1.5581049919128418, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8750154972076416, + "num_tokens": 207036776.0, + "step": 5681 + }, + { + "epoch": 1.0551532033426183, + "grad_norm": 1.5807809829711914, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8666096329689026, + "num_tokens": 207075476.0, + "step": 5682 + }, + { + "epoch": 1.055338904363974, + "grad_norm": 1.5613930225372314, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8632475137710571, + "num_tokens": 207114216.0, + "step": 5683 + }, + { + "epoch": 1.0555246053853296, + "grad_norm": 1.6413828134536743, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8672742247581482, + "num_tokens": 207147659.0, + "step": 5684 + }, + { + "epoch": 1.0557103064066853, + "grad_norm": 1.428632378578186, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8877626657485962, + "num_tokens": 207188152.0, + "step": 5685 + }, + { + "epoch": 1.0558960074280408, + "grad_norm": 1.4937796592712402, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.877804696559906, + "num_tokens": 207228106.0, + "step": 5686 + }, + { + "epoch": 1.0560817084493965, + "grad_norm": 1.5737037658691406, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8716434240341187, + "num_tokens": 207266199.0, + "step": 5687 + }, + { + "epoch": 1.056267409470752, + "grad_norm": 1.5262095928192139, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8742767572402954, + "num_tokens": 207305601.0, + "step": 5688 + }, + { + "epoch": 1.0564531104921078, + "grad_norm": 1.5756713151931763, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8741751909255981, + "num_tokens": 207341751.0, + "step": 5689 + }, + { + "epoch": 1.0566388115134633, + "grad_norm": 1.498016119003296, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8723093867301941, + "num_tokens": 207379998.0, + "step": 5690 + }, + { + "epoch": 1.056824512534819, + "grad_norm": 1.3973854780197144, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8771370649337769, + "num_tokens": 207421330.0, + "step": 5691 + }, + { + "epoch": 1.0570102135561745, + "grad_norm": 1.4487191438674927, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8734275102615356, + "num_tokens": 207459683.0, + "step": 5692 + }, + { + "epoch": 1.0571959145775303, + "grad_norm": 1.6099790334701538, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8640716671943665, + "num_tokens": 207497481.0, + "step": 5693 + }, + { + "epoch": 1.0573816155988858, + "grad_norm": 1.5118401050567627, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8861539959907532, + "num_tokens": 207532262.0, + "step": 5694 + }, + { + "epoch": 1.0575673166202415, + "grad_norm": 1.5240672826766968, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8776537775993347, + "num_tokens": 207570863.0, + "step": 5695 + }, + { + "epoch": 1.057753017641597, + "grad_norm": 1.5059483051300049, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8757456541061401, + "num_tokens": 207612128.0, + "step": 5696 + }, + { + "epoch": 1.0579387186629527, + "grad_norm": 1.6949419975280762, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.862019419670105, + "num_tokens": 207644829.0, + "step": 5697 + }, + { + "epoch": 1.0581244196843083, + "grad_norm": 1.6546612977981567, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8758887052536011, + "num_tokens": 207683053.0, + "step": 5698 + }, + { + "epoch": 1.0583101207056638, + "grad_norm": 1.6383440494537354, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8629466891288757, + "num_tokens": 207718712.0, + "step": 5699 + }, + { + "epoch": 1.0584958217270195, + "grad_norm": 1.5550816059112549, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8750573992729187, + "num_tokens": 207753601.0, + "step": 5700 + }, + { + "epoch": 1.058681522748375, + "grad_norm": 1.4649806022644043, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8749452829360962, + "num_tokens": 207794309.0, + "step": 5701 + }, + { + "epoch": 1.0588672237697307, + "grad_norm": 1.5915173292160034, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8777999877929688, + "num_tokens": 207826686.0, + "step": 5702 + }, + { + "epoch": 1.0590529247910863, + "grad_norm": 1.6137193441390991, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.876994788646698, + "num_tokens": 207862442.0, + "step": 5703 + }, + { + "epoch": 1.059238625812442, + "grad_norm": 1.6187795400619507, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8622322678565979, + "num_tokens": 207900502.0, + "step": 5704 + }, + { + "epoch": 1.0594243268337975, + "grad_norm": 1.600780725479126, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8641631603240967, + "num_tokens": 207936267.0, + "step": 5705 + }, + { + "epoch": 1.0596100278551532, + "grad_norm": 1.525221586227417, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8809818029403687, + "num_tokens": 207971914.0, + "step": 5706 + }, + { + "epoch": 1.0597957288765087, + "grad_norm": 1.5301575660705566, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8841042518615723, + "num_tokens": 208008327.0, + "step": 5707 + }, + { + "epoch": 1.0599814298978645, + "grad_norm": 1.689386248588562, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8874654769897461, + "num_tokens": 208039737.0, + "step": 5708 + }, + { + "epoch": 1.06016713091922, + "grad_norm": 1.4593393802642822, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8745977878570557, + "num_tokens": 208081603.0, + "step": 5709 + }, + { + "epoch": 1.0603528319405757, + "grad_norm": 1.5954607725143433, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8781059980392456, + "num_tokens": 208116948.0, + "step": 5710 + }, + { + "epoch": 1.0605385329619312, + "grad_norm": 1.4346424341201782, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8820338249206543, + "num_tokens": 208157661.0, + "step": 5711 + }, + { + "epoch": 1.060724233983287, + "grad_norm": 1.4995261430740356, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.882983386516571, + "num_tokens": 208194245.0, + "step": 5712 + }, + { + "epoch": 1.0609099350046425, + "grad_norm": 1.524040699005127, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8567327260971069, + "num_tokens": 208235900.0, + "step": 5713 + }, + { + "epoch": 1.0610956360259982, + "grad_norm": 1.4662314653396606, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8826054930686951, + "num_tokens": 208274622.0, + "step": 5714 + }, + { + "epoch": 1.0612813370473537, + "grad_norm": 1.6332107782363892, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8729075193405151, + "num_tokens": 208311094.0, + "step": 5715 + }, + { + "epoch": 1.0614670380687095, + "grad_norm": 1.473362684249878, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8837031126022339, + "num_tokens": 208347259.0, + "step": 5716 + }, + { + "epoch": 1.061652739090065, + "grad_norm": 1.4901801347732544, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.890207827091217, + "num_tokens": 208384175.0, + "step": 5717 + }, + { + "epoch": 1.0618384401114207, + "grad_norm": 1.6653205156326294, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8779770135879517, + "num_tokens": 208413957.0, + "step": 5718 + }, + { + "epoch": 1.0620241411327762, + "grad_norm": 1.663487195968628, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8583706617355347, + "num_tokens": 208450469.0, + "step": 5719 + }, + { + "epoch": 1.062209842154132, + "grad_norm": 1.6537060737609863, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8894028663635254, + "num_tokens": 208482175.0, + "step": 5720 + }, + { + "epoch": 1.0623955431754875, + "grad_norm": 1.7076505422592163, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8695505857467651, + "num_tokens": 208519052.0, + "step": 5721 + }, + { + "epoch": 1.062581244196843, + "grad_norm": 1.575632095336914, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8673118948936462, + "num_tokens": 208557974.0, + "step": 5722 + }, + { + "epoch": 1.0627669452181987, + "grad_norm": 1.5713136196136475, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8699773550033569, + "num_tokens": 208595378.0, + "step": 5723 + }, + { + "epoch": 1.0629526462395544, + "grad_norm": 1.6224597692489624, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.876266360282898, + "num_tokens": 208632149.0, + "step": 5724 + }, + { + "epoch": 1.06313834726091, + "grad_norm": 1.4957386255264282, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8943885564804077, + "num_tokens": 208667195.0, + "step": 5725 + }, + { + "epoch": 1.0633240482822655, + "grad_norm": 1.6334785223007202, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8770202398300171, + "num_tokens": 208701843.0, + "step": 5726 + }, + { + "epoch": 1.0635097493036212, + "grad_norm": 1.5750972032546997, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8914088010787964, + "num_tokens": 208736291.0, + "step": 5727 + }, + { + "epoch": 1.0636954503249767, + "grad_norm": 1.5055744647979736, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8787658214569092, + "num_tokens": 208772141.0, + "step": 5728 + }, + { + "epoch": 1.0638811513463324, + "grad_norm": 1.410746693611145, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8788739442825317, + "num_tokens": 208811745.0, + "step": 5729 + }, + { + "epoch": 1.064066852367688, + "grad_norm": 1.6379916667938232, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8740423917770386, + "num_tokens": 208846007.0, + "step": 5730 + }, + { + "epoch": 1.0642525533890437, + "grad_norm": 1.6043591499328613, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8849391937255859, + "num_tokens": 208877512.0, + "step": 5731 + }, + { + "epoch": 1.0644382544103992, + "grad_norm": 1.5086848735809326, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8647565841674805, + "num_tokens": 208919794.0, + "step": 5732 + }, + { + "epoch": 1.064623955431755, + "grad_norm": 1.589450716972351, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8761439323425293, + "num_tokens": 208955317.0, + "step": 5733 + }, + { + "epoch": 1.0648096564531104, + "grad_norm": 1.5759327411651611, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8856767416000366, + "num_tokens": 208990522.0, + "step": 5734 + }, + { + "epoch": 1.0649953574744662, + "grad_norm": 1.661958932876587, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.870693564414978, + "num_tokens": 209024636.0, + "step": 5735 + }, + { + "epoch": 1.0651810584958217, + "grad_norm": 1.634708046913147, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8718395829200745, + "num_tokens": 209058079.0, + "step": 5736 + }, + { + "epoch": 1.0653667595171774, + "grad_norm": 1.4335628747940063, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8741810321807861, + "num_tokens": 209098176.0, + "step": 5737 + }, + { + "epoch": 1.065552460538533, + "grad_norm": 1.5104597806930542, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8771772384643555, + "num_tokens": 209136679.0, + "step": 5738 + }, + { + "epoch": 1.0657381615598887, + "grad_norm": 1.478316307067871, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8895174264907837, + "num_tokens": 209176914.0, + "step": 5739 + }, + { + "epoch": 1.0659238625812442, + "grad_norm": 1.5442417860031128, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8779035806655884, + "num_tokens": 209214152.0, + "step": 5740 + }, + { + "epoch": 1.0661095636026, + "grad_norm": 1.5762319564819336, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8849048614501953, + "num_tokens": 209253909.0, + "step": 5741 + }, + { + "epoch": 1.0662952646239554, + "grad_norm": 1.5320578813552856, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8770923614501953, + "num_tokens": 209296021.0, + "step": 5742 + }, + { + "epoch": 1.0664809656453111, + "grad_norm": 1.513092279434204, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8772435188293457, + "num_tokens": 209335607.0, + "step": 5743 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 1.4908151626586914, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8873500823974609, + "num_tokens": 209372671.0, + "step": 5744 + }, + { + "epoch": 1.0668523676880224, + "grad_norm": 1.5801990032196045, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8828817009925842, + "num_tokens": 209410069.0, + "step": 5745 + }, + { + "epoch": 1.067038068709378, + "grad_norm": 1.902984619140625, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8667067885398865, + "num_tokens": 209440844.0, + "step": 5746 + }, + { + "epoch": 1.0672237697307336, + "grad_norm": 1.9013644456863403, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.881659984588623, + "num_tokens": 209468891.0, + "step": 5747 + }, + { + "epoch": 1.0674094707520891, + "grad_norm": 1.6037777662277222, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8641853332519531, + "num_tokens": 209505279.0, + "step": 5748 + }, + { + "epoch": 1.0675951717734447, + "grad_norm": 1.603691577911377, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8839232921600342, + "num_tokens": 209540866.0, + "step": 5749 + }, + { + "epoch": 1.0677808727948004, + "grad_norm": 1.6147620677947998, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8747696876525879, + "num_tokens": 209572988.0, + "step": 5750 + }, + { + "epoch": 1.067966573816156, + "grad_norm": 1.6087875366210938, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8831152319908142, + "num_tokens": 209608221.0, + "step": 5751 + }, + { + "epoch": 1.0681522748375116, + "grad_norm": 1.5376957654953003, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8781459331512451, + "num_tokens": 209643736.0, + "step": 5752 + }, + { + "epoch": 1.0683379758588671, + "grad_norm": 1.6209479570388794, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8725558519363403, + "num_tokens": 209678103.0, + "step": 5753 + }, + { + "epoch": 1.0685236768802229, + "grad_norm": 1.6436893939971924, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8754160404205322, + "num_tokens": 209711923.0, + "step": 5754 + }, + { + "epoch": 1.0687093779015784, + "grad_norm": 1.5605827569961548, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8676398992538452, + "num_tokens": 209746946.0, + "step": 5755 + }, + { + "epoch": 1.0688950789229341, + "grad_norm": 1.5920759439468384, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.863924503326416, + "num_tokens": 209784966.0, + "step": 5756 + }, + { + "epoch": 1.0690807799442896, + "grad_norm": 1.557572603225708, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8769079446792603, + "num_tokens": 209824543.0, + "step": 5757 + }, + { + "epoch": 1.0692664809656454, + "grad_norm": 1.5902023315429688, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8775386810302734, + "num_tokens": 209860842.0, + "step": 5758 + }, + { + "epoch": 1.0694521819870009, + "grad_norm": 1.5023020505905151, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8667190670967102, + "num_tokens": 209898944.0, + "step": 5759 + }, + { + "epoch": 1.0696378830083566, + "grad_norm": 1.4700279235839844, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8846288323402405, + "num_tokens": 209936650.0, + "step": 5760 + }, + { + "epoch": 1.0698235840297121, + "grad_norm": 1.649735450744629, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8664411306381226, + "num_tokens": 209970706.0, + "step": 5761 + }, + { + "epoch": 1.0700092850510678, + "grad_norm": 1.6885735988616943, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8652054071426392, + "num_tokens": 210000945.0, + "step": 5762 + }, + { + "epoch": 1.0701949860724234, + "grad_norm": 1.613795518875122, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8663327097892761, + "num_tokens": 210038142.0, + "step": 5763 + }, + { + "epoch": 1.070380687093779, + "grad_norm": 1.5023095607757568, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8819834589958191, + "num_tokens": 210076448.0, + "step": 5764 + }, + { + "epoch": 1.0705663881151346, + "grad_norm": 1.549511432647705, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8692033886909485, + "num_tokens": 210116545.0, + "step": 5765 + }, + { + "epoch": 1.0707520891364903, + "grad_norm": 1.6795722246170044, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8645254373550415, + "num_tokens": 210150005.0, + "step": 5766 + }, + { + "epoch": 1.0709377901578458, + "grad_norm": 1.5434366464614868, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8652178645133972, + "num_tokens": 210190136.0, + "step": 5767 + }, + { + "epoch": 1.0711234911792016, + "grad_norm": 1.6346725225448608, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8798755407333374, + "num_tokens": 210222985.0, + "step": 5768 + }, + { + "epoch": 1.071309192200557, + "grad_norm": 1.4912937879562378, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8732167482376099, + "num_tokens": 210262849.0, + "step": 5769 + }, + { + "epoch": 1.0714948932219128, + "grad_norm": 1.451985239982605, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.888043999671936, + "num_tokens": 210299605.0, + "step": 5770 + }, + { + "epoch": 1.0716805942432683, + "grad_norm": 1.4898087978363037, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8788338899612427, + "num_tokens": 210338616.0, + "step": 5771 + }, + { + "epoch": 1.0718662952646238, + "grad_norm": 1.6448626518249512, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8749383091926575, + "num_tokens": 210372185.0, + "step": 5772 + }, + { + "epoch": 1.0720519962859796, + "grad_norm": 1.6072819232940674, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8907256126403809, + "num_tokens": 210403421.0, + "step": 5773 + }, + { + "epoch": 1.072237697307335, + "grad_norm": 1.7176363468170166, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8736084699630737, + "num_tokens": 210438092.0, + "step": 5774 + }, + { + "epoch": 1.0724233983286908, + "grad_norm": 1.549986481666565, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8775955438613892, + "num_tokens": 210475450.0, + "step": 5775 + }, + { + "epoch": 1.0726090993500463, + "grad_norm": 1.621509313583374, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.864013671875, + "num_tokens": 210514728.0, + "step": 5776 + }, + { + "epoch": 1.072794800371402, + "grad_norm": 1.6519817113876343, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8789495825767517, + "num_tokens": 210547265.0, + "step": 5777 + }, + { + "epoch": 1.0729805013927576, + "grad_norm": 1.634016513824463, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8816730380058289, + "num_tokens": 210582122.0, + "step": 5778 + }, + { + "epoch": 1.0731662024141133, + "grad_norm": 1.6479772329330444, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.860182523727417, + "num_tokens": 210619470.0, + "step": 5779 + }, + { + "epoch": 1.0733519034354688, + "grad_norm": 1.7035949230194092, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8814592361450195, + "num_tokens": 210649945.0, + "step": 5780 + }, + { + "epoch": 1.0735376044568246, + "grad_norm": 1.5500507354736328, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8752981424331665, + "num_tokens": 210685166.0, + "step": 5781 + }, + { + "epoch": 1.07372330547818, + "grad_norm": 1.563923954963684, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8732491731643677, + "num_tokens": 210723452.0, + "step": 5782 + }, + { + "epoch": 1.0739090064995358, + "grad_norm": 1.4907081127166748, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8700603246688843, + "num_tokens": 210765853.0, + "step": 5783 + }, + { + "epoch": 1.0740947075208913, + "grad_norm": 1.680119276046753, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8645287752151489, + "num_tokens": 210799749.0, + "step": 5784 + }, + { + "epoch": 1.074280408542247, + "grad_norm": 1.5974262952804565, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8718196153640747, + "num_tokens": 210832954.0, + "step": 5785 + }, + { + "epoch": 1.0744661095636026, + "grad_norm": 1.4497838020324707, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8711540699005127, + "num_tokens": 210874640.0, + "step": 5786 + }, + { + "epoch": 1.0746518105849583, + "grad_norm": 1.6329017877578735, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8776448369026184, + "num_tokens": 210908108.0, + "step": 5787 + }, + { + "epoch": 1.0748375116063138, + "grad_norm": 1.6270354986190796, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8779339790344238, + "num_tokens": 210941734.0, + "step": 5788 + }, + { + "epoch": 1.0750232126276695, + "grad_norm": 1.6154299974441528, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.862943172454834, + "num_tokens": 210981302.0, + "step": 5789 + }, + { + "epoch": 1.075208913649025, + "grad_norm": 1.6082892417907715, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8705050945281982, + "num_tokens": 211015133.0, + "step": 5790 + }, + { + "epoch": 1.0753946146703808, + "grad_norm": 1.4272485971450806, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8938932418823242, + "num_tokens": 211054772.0, + "step": 5791 + }, + { + "epoch": 1.0755803156917363, + "grad_norm": 1.547731876373291, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8794420957565308, + "num_tokens": 211089907.0, + "step": 5792 + }, + { + "epoch": 1.075766016713092, + "grad_norm": 1.6400643587112427, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8820816278457642, + "num_tokens": 211126691.0, + "step": 5793 + }, + { + "epoch": 1.0759517177344475, + "grad_norm": 1.738637089729309, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8794934749603271, + "num_tokens": 211160298.0, + "step": 5794 + }, + { + "epoch": 1.076137418755803, + "grad_norm": 1.5177462100982666, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8829287886619568, + "num_tokens": 211200174.0, + "step": 5795 + }, + { + "epoch": 1.0763231197771588, + "grad_norm": 1.654774785041809, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8602039813995361, + "num_tokens": 211235480.0, + "step": 5796 + }, + { + "epoch": 1.0765088207985145, + "grad_norm": 1.6246165037155151, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8818234205245972, + "num_tokens": 211270377.0, + "step": 5797 + }, + { + "epoch": 1.07669452181987, + "grad_norm": 1.709423542022705, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8878885507583618, + "num_tokens": 211304134.0, + "step": 5798 + }, + { + "epoch": 1.0768802228412255, + "grad_norm": 1.8964346647262573, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8744168281555176, + "num_tokens": 211330751.0, + "step": 5799 + }, + { + "epoch": 1.0770659238625813, + "grad_norm": 1.7376363277435303, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8757944107055664, + "num_tokens": 211361658.0, + "step": 5800 + }, + { + "epoch": 1.0772516248839368, + "grad_norm": 1.5319362878799438, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8824583292007446, + "num_tokens": 211400294.0, + "step": 5801 + }, + { + "epoch": 1.0774373259052925, + "grad_norm": 1.7030022144317627, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8767561912536621, + "num_tokens": 211430553.0, + "step": 5802 + }, + { + "epoch": 1.077623026926648, + "grad_norm": 1.4625730514526367, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8655821681022644, + "num_tokens": 211471779.0, + "step": 5803 + }, + { + "epoch": 1.0778087279480038, + "grad_norm": 1.4324389696121216, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8797734975814819, + "num_tokens": 211512173.0, + "step": 5804 + }, + { + "epoch": 1.0779944289693593, + "grad_norm": 1.5844095945358276, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8698805570602417, + "num_tokens": 211547170.0, + "step": 5805 + }, + { + "epoch": 1.078180129990715, + "grad_norm": 1.6347689628601074, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8803185820579529, + "num_tokens": 211579679.0, + "step": 5806 + }, + { + "epoch": 1.0783658310120705, + "grad_norm": 1.715951681137085, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8677464127540588, + "num_tokens": 211610470.0, + "step": 5807 + }, + { + "epoch": 1.0785515320334262, + "grad_norm": 1.6233824491500854, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8520917296409607, + "num_tokens": 211653158.0, + "step": 5808 + }, + { + "epoch": 1.0787372330547818, + "grad_norm": 1.6077795028686523, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8643908500671387, + "num_tokens": 211691796.0, + "step": 5809 + }, + { + "epoch": 1.0789229340761375, + "grad_norm": 1.5627268552780151, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.880763053894043, + "num_tokens": 211732914.0, + "step": 5810 + }, + { + "epoch": 1.079108635097493, + "grad_norm": 1.5854648351669312, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8835854530334473, + "num_tokens": 211767770.0, + "step": 5811 + }, + { + "epoch": 1.0792943361188487, + "grad_norm": 1.6630067825317383, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8665891885757446, + "num_tokens": 211806529.0, + "step": 5812 + }, + { + "epoch": 1.0794800371402042, + "grad_norm": 1.6241925954818726, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8828620910644531, + "num_tokens": 211839395.0, + "step": 5813 + }, + { + "epoch": 1.07966573816156, + "grad_norm": 1.5432833433151245, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8759322166442871, + "num_tokens": 211877761.0, + "step": 5814 + }, + { + "epoch": 1.0798514391829155, + "grad_norm": 1.5142991542816162, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8857177495956421, + "num_tokens": 211912171.0, + "step": 5815 + }, + { + "epoch": 1.0800371402042712, + "grad_norm": 1.593447208404541, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8769947290420532, + "num_tokens": 211948087.0, + "step": 5816 + }, + { + "epoch": 1.0802228412256267, + "grad_norm": 1.4751328229904175, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8768321871757507, + "num_tokens": 211989074.0, + "step": 5817 + }, + { + "epoch": 1.0804085422469825, + "grad_norm": 1.5648659467697144, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8751521110534668, + "num_tokens": 212024759.0, + "step": 5818 + }, + { + "epoch": 1.080594243268338, + "grad_norm": 1.6597672700881958, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.873935341835022, + "num_tokens": 212059055.0, + "step": 5819 + }, + { + "epoch": 1.0807799442896937, + "grad_norm": 1.4512689113616943, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8772666454315186, + "num_tokens": 212099677.0, + "step": 5820 + }, + { + "epoch": 1.0809656453110492, + "grad_norm": 1.5580506324768066, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8753415942192078, + "num_tokens": 212135369.0, + "step": 5821 + }, + { + "epoch": 1.0811513463324047, + "grad_norm": 1.6154991388320923, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.877785325050354, + "num_tokens": 212169062.0, + "step": 5822 + }, + { + "epoch": 1.0813370473537605, + "grad_norm": 1.6076823472976685, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8851079940795898, + "num_tokens": 212201845.0, + "step": 5823 + }, + { + "epoch": 1.081522748375116, + "grad_norm": 1.7484114170074463, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8857181072235107, + "num_tokens": 212231143.0, + "step": 5824 + }, + { + "epoch": 1.0817084493964717, + "grad_norm": 1.5382931232452393, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8788586854934692, + "num_tokens": 212267188.0, + "step": 5825 + }, + { + "epoch": 1.0818941504178272, + "grad_norm": 1.549147605895996, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8691698312759399, + "num_tokens": 212304814.0, + "step": 5826 + }, + { + "epoch": 1.082079851439183, + "grad_norm": 1.604683756828308, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8673364520072937, + "num_tokens": 212340117.0, + "step": 5827 + }, + { + "epoch": 1.0822655524605385, + "grad_norm": 1.6902896165847778, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8708805441856384, + "num_tokens": 212376876.0, + "step": 5828 + }, + { + "epoch": 1.0824512534818942, + "grad_norm": 1.473506212234497, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8871409893035889, + "num_tokens": 212413924.0, + "step": 5829 + }, + { + "epoch": 1.0826369545032497, + "grad_norm": 1.4708951711654663, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8709938526153564, + "num_tokens": 212454971.0, + "step": 5830 + }, + { + "epoch": 1.0828226555246054, + "grad_norm": 1.4392166137695312, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8938332796096802, + "num_tokens": 212492514.0, + "step": 5831 + }, + { + "epoch": 1.083008356545961, + "grad_norm": 1.5371445417404175, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8714907169342041, + "num_tokens": 212530686.0, + "step": 5832 + }, + { + "epoch": 1.0831940575673167, + "grad_norm": 1.5195708274841309, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8792473673820496, + "num_tokens": 212568383.0, + "step": 5833 + }, + { + "epoch": 1.0833797585886722, + "grad_norm": 1.44618821144104, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8838307857513428, + "num_tokens": 212607835.0, + "step": 5834 + }, + { + "epoch": 1.083565459610028, + "grad_norm": 1.7204668521881104, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8736432790756226, + "num_tokens": 212643507.0, + "step": 5835 + }, + { + "epoch": 1.0837511606313834, + "grad_norm": 1.4320429563522339, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8868883848190308, + "num_tokens": 212682463.0, + "step": 5836 + }, + { + "epoch": 1.0839368616527392, + "grad_norm": 1.5097324848175049, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8837887048721313, + "num_tokens": 212720169.0, + "step": 5837 + }, + { + "epoch": 1.0841225626740947, + "grad_norm": 1.523364543914795, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8799862861633301, + "num_tokens": 212753876.0, + "step": 5838 + }, + { + "epoch": 1.0843082636954504, + "grad_norm": 1.5962066650390625, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8700188994407654, + "num_tokens": 212790412.0, + "step": 5839 + }, + { + "epoch": 1.084493964716806, + "grad_norm": 1.6027077436447144, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8764691948890686, + "num_tokens": 212828412.0, + "step": 5840 + }, + { + "epoch": 1.0846796657381617, + "grad_norm": 1.5930196046829224, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8763790130615234, + "num_tokens": 212862800.0, + "step": 5841 + }, + { + "epoch": 1.0848653667595172, + "grad_norm": 1.5439947843551636, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8629028797149658, + "num_tokens": 212904984.0, + "step": 5842 + }, + { + "epoch": 1.085051067780873, + "grad_norm": 1.4715590476989746, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8890917301177979, + "num_tokens": 212938600.0, + "step": 5843 + }, + { + "epoch": 1.0852367688022284, + "grad_norm": 1.6265429258346558, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8494092226028442, + "num_tokens": 212980640.0, + "step": 5844 + }, + { + "epoch": 1.085422469823584, + "grad_norm": 1.6065222024917603, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8756598830223083, + "num_tokens": 213014067.0, + "step": 5845 + }, + { + "epoch": 1.0856081708449397, + "grad_norm": 1.5698766708374023, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8754048347473145, + "num_tokens": 213050746.0, + "step": 5846 + }, + { + "epoch": 1.0857938718662952, + "grad_norm": 1.4684160947799683, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8603969812393188, + "num_tokens": 213091222.0, + "step": 5847 + }, + { + "epoch": 1.085979572887651, + "grad_norm": 1.549692153930664, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.886949896812439, + "num_tokens": 213123806.0, + "step": 5848 + }, + { + "epoch": 1.0861652739090064, + "grad_norm": 1.5948561429977417, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8623130917549133, + "num_tokens": 213162839.0, + "step": 5849 + }, + { + "epoch": 1.0863509749303621, + "grad_norm": 1.7634692192077637, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8711689710617065, + "num_tokens": 213191987.0, + "step": 5850 + }, + { + "epoch": 1.0865366759517177, + "grad_norm": 1.5803838968276978, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8624626398086548, + "num_tokens": 213230185.0, + "step": 5851 + }, + { + "epoch": 1.0867223769730734, + "grad_norm": 1.780556082725525, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8530588746070862, + "num_tokens": 213260283.0, + "step": 5852 + }, + { + "epoch": 1.086908077994429, + "grad_norm": 1.4037758111953735, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8754475712776184, + "num_tokens": 213301795.0, + "step": 5853 + }, + { + "epoch": 1.0870937790157846, + "grad_norm": 1.5096510648727417, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8734608292579651, + "num_tokens": 213343267.0, + "step": 5854 + }, + { + "epoch": 1.0872794800371401, + "grad_norm": 1.5559264421463013, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8757566213607788, + "num_tokens": 213385738.0, + "step": 5855 + }, + { + "epoch": 1.0874651810584959, + "grad_norm": 1.6346861124038696, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8719449639320374, + "num_tokens": 213420576.0, + "step": 5856 + }, + { + "epoch": 1.0876508820798514, + "grad_norm": 1.5877623558044434, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8765951991081238, + "num_tokens": 213455832.0, + "step": 5857 + }, + { + "epoch": 1.0878365831012071, + "grad_norm": 1.5255862474441528, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8724491000175476, + "num_tokens": 213494159.0, + "step": 5858 + }, + { + "epoch": 1.0880222841225626, + "grad_norm": 1.6285086870193481, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8595767021179199, + "num_tokens": 213531796.0, + "step": 5859 + }, + { + "epoch": 1.0882079851439184, + "grad_norm": 1.5507818460464478, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8554367423057556, + "num_tokens": 213571841.0, + "step": 5860 + }, + { + "epoch": 1.0883936861652739, + "grad_norm": 1.4205139875411987, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8839141726493835, + "num_tokens": 213613096.0, + "step": 5861 + }, + { + "epoch": 1.0885793871866296, + "grad_norm": 1.511078953742981, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8858361840248108, + "num_tokens": 213650817.0, + "step": 5862 + }, + { + "epoch": 1.0887650882079851, + "grad_norm": 1.497680902481079, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8751659393310547, + "num_tokens": 213690445.0, + "step": 5863 + }, + { + "epoch": 1.0889507892293409, + "grad_norm": 1.8795843124389648, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8740407228469849, + "num_tokens": 213718860.0, + "step": 5864 + }, + { + "epoch": 1.0891364902506964, + "grad_norm": 1.5417782068252563, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8649738430976868, + "num_tokens": 213754694.0, + "step": 5865 + }, + { + "epoch": 1.089322191272052, + "grad_norm": 1.7443275451660156, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8684355616569519, + "num_tokens": 213786577.0, + "step": 5866 + }, + { + "epoch": 1.0895078922934076, + "grad_norm": 1.4848558902740479, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8805719614028931, + "num_tokens": 213824542.0, + "step": 5867 + }, + { + "epoch": 1.0896935933147631, + "grad_norm": 1.498183012008667, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8852527141571045, + "num_tokens": 213860431.0, + "step": 5868 + }, + { + "epoch": 1.0898792943361189, + "grad_norm": 1.589038372039795, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8796999454498291, + "num_tokens": 213890854.0, + "step": 5869 + }, + { + "epoch": 1.0900649953574744, + "grad_norm": 1.585699439048767, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.889259934425354, + "num_tokens": 213924643.0, + "step": 5870 + }, + { + "epoch": 1.09025069637883, + "grad_norm": 1.4583238363265991, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8862399458885193, + "num_tokens": 213965196.0, + "step": 5871 + }, + { + "epoch": 1.0904363974001856, + "grad_norm": 1.615181803703308, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8912585377693176, + "num_tokens": 213994581.0, + "step": 5872 + }, + { + "epoch": 1.0906220984215413, + "grad_norm": 1.537825345993042, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8764493465423584, + "num_tokens": 214031610.0, + "step": 5873 + }, + { + "epoch": 1.0908077994428969, + "grad_norm": 1.4917881488800049, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8891435861587524, + "num_tokens": 214068140.0, + "step": 5874 + }, + { + "epoch": 1.0909935004642526, + "grad_norm": 1.5715627670288086, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8900021314620972, + "num_tokens": 214103762.0, + "step": 5875 + }, + { + "epoch": 1.091179201485608, + "grad_norm": 1.4330769777297974, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8791048526763916, + "num_tokens": 214146973.0, + "step": 5876 + }, + { + "epoch": 1.0913649025069638, + "grad_norm": 1.5534653663635254, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8642500638961792, + "num_tokens": 214184520.0, + "step": 5877 + }, + { + "epoch": 1.0915506035283193, + "grad_norm": 1.5631338357925415, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8669024109840393, + "num_tokens": 214221391.0, + "step": 5878 + }, + { + "epoch": 1.091736304549675, + "grad_norm": 1.6091506481170654, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.87235027551651, + "num_tokens": 214254009.0, + "step": 5879 + }, + { + "epoch": 1.0919220055710306, + "grad_norm": 1.6024237871170044, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8757630586624146, + "num_tokens": 214288990.0, + "step": 5880 + }, + { + "epoch": 1.0921077065923863, + "grad_norm": 1.5373783111572266, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8870168924331665, + "num_tokens": 214326329.0, + "step": 5881 + }, + { + "epoch": 1.0922934076137418, + "grad_norm": 1.5291550159454346, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8867610692977905, + "num_tokens": 214361453.0, + "step": 5882 + }, + { + "epoch": 1.0924791086350976, + "grad_norm": 1.7222334146499634, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.879603385925293, + "num_tokens": 214394265.0, + "step": 5883 + }, + { + "epoch": 1.092664809656453, + "grad_norm": 1.5374417304992676, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8714289665222168, + "num_tokens": 214436751.0, + "step": 5884 + }, + { + "epoch": 1.0928505106778088, + "grad_norm": 1.5362025499343872, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.883963942527771, + "num_tokens": 214472558.0, + "step": 5885 + }, + { + "epoch": 1.0930362116991643, + "grad_norm": 1.5471099615097046, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8913121223449707, + "num_tokens": 214509796.0, + "step": 5886 + }, + { + "epoch": 1.09322191272052, + "grad_norm": 1.6838271617889404, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8746336698532104, + "num_tokens": 214543044.0, + "step": 5887 + }, + { + "epoch": 1.0934076137418756, + "grad_norm": 1.5624902248382568, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8662095665931702, + "num_tokens": 214583129.0, + "step": 5888 + }, + { + "epoch": 1.0935933147632313, + "grad_norm": 1.7253201007843018, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8726022243499756, + "num_tokens": 214615784.0, + "step": 5889 + }, + { + "epoch": 1.0937790157845868, + "grad_norm": 1.621411919593811, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8620048761367798, + "num_tokens": 214654367.0, + "step": 5890 + }, + { + "epoch": 1.0939647168059423, + "grad_norm": 1.4823644161224365, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8864994645118713, + "num_tokens": 214694072.0, + "step": 5891 + }, + { + "epoch": 1.094150417827298, + "grad_norm": 1.6525263786315918, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8791453242301941, + "num_tokens": 214725648.0, + "step": 5892 + }, + { + "epoch": 1.0943361188486538, + "grad_norm": 1.576637864112854, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8673117160797119, + "num_tokens": 214763050.0, + "step": 5893 + }, + { + "epoch": 1.0945218198700093, + "grad_norm": 1.6159363985061646, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8860563039779663, + "num_tokens": 214797479.0, + "step": 5894 + }, + { + "epoch": 1.0947075208913648, + "grad_norm": 1.5768702030181885, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8733060359954834, + "num_tokens": 214836440.0, + "step": 5895 + }, + { + "epoch": 1.0948932219127205, + "grad_norm": 1.5336347818374634, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8780215978622437, + "num_tokens": 214871835.0, + "step": 5896 + }, + { + "epoch": 1.095078922934076, + "grad_norm": 1.628117322921753, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8646801114082336, + "num_tokens": 214907809.0, + "step": 5897 + }, + { + "epoch": 1.0952646239554318, + "grad_norm": 1.514212727546692, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8907490372657776, + "num_tokens": 214943208.0, + "step": 5898 + }, + { + "epoch": 1.0954503249767873, + "grad_norm": 1.5375804901123047, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8783360123634338, + "num_tokens": 214983466.0, + "step": 5899 + }, + { + "epoch": 1.095636025998143, + "grad_norm": 1.5261080265045166, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8726195096969604, + "num_tokens": 215023388.0, + "step": 5900 + }, + { + "epoch": 1.0958217270194985, + "grad_norm": 1.6471599340438843, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.853157639503479, + "num_tokens": 215057803.0, + "step": 5901 + }, + { + "epoch": 1.0960074280408543, + "grad_norm": 1.6851965188980103, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8663021326065063, + "num_tokens": 215088377.0, + "step": 5902 + }, + { + "epoch": 1.0961931290622098, + "grad_norm": 1.587325096130371, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8680593371391296, + "num_tokens": 215128276.0, + "step": 5903 + }, + { + "epoch": 1.0963788300835655, + "grad_norm": 1.6632912158966064, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8769985437393188, + "num_tokens": 215165229.0, + "step": 5904 + }, + { + "epoch": 1.096564531104921, + "grad_norm": 1.5956461429595947, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8738052845001221, + "num_tokens": 215200997.0, + "step": 5905 + }, + { + "epoch": 1.0967502321262768, + "grad_norm": 1.548689603805542, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8825178742408752, + "num_tokens": 215242594.0, + "step": 5906 + }, + { + "epoch": 1.0969359331476323, + "grad_norm": 1.6008143424987793, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8693633079528809, + "num_tokens": 215277188.0, + "step": 5907 + }, + { + "epoch": 1.097121634168988, + "grad_norm": 1.4482742547988892, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8685683012008667, + "num_tokens": 215319651.0, + "step": 5908 + }, + { + "epoch": 1.0973073351903435, + "grad_norm": 1.6128921508789062, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8765194416046143, + "num_tokens": 215355349.0, + "step": 5909 + }, + { + "epoch": 1.0974930362116992, + "grad_norm": 1.689687967300415, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8727960586547852, + "num_tokens": 215387680.0, + "step": 5910 + }, + { + "epoch": 1.0976787372330548, + "grad_norm": 1.7419661283493042, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8681237697601318, + "num_tokens": 215420205.0, + "step": 5911 + }, + { + "epoch": 1.0978644382544105, + "grad_norm": 1.5385336875915527, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8892316818237305, + "num_tokens": 215453171.0, + "step": 5912 + }, + { + "epoch": 1.098050139275766, + "grad_norm": 1.6635013818740845, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8758343458175659, + "num_tokens": 215487802.0, + "step": 5913 + }, + { + "epoch": 1.0982358402971217, + "grad_norm": 1.6626800298690796, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.881885290145874, + "num_tokens": 215526888.0, + "step": 5914 + }, + { + "epoch": 1.0984215413184772, + "grad_norm": 1.6012428998947144, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8730865716934204, + "num_tokens": 215560690.0, + "step": 5915 + }, + { + "epoch": 1.098607242339833, + "grad_norm": 1.5022752285003662, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8706049919128418, + "num_tokens": 215598003.0, + "step": 5916 + }, + { + "epoch": 1.0987929433611885, + "grad_norm": 1.5368924140930176, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8687460422515869, + "num_tokens": 215633897.0, + "step": 5917 + }, + { + "epoch": 1.098978644382544, + "grad_norm": 1.69033944606781, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8737255930900574, + "num_tokens": 215666050.0, + "step": 5918 + }, + { + "epoch": 1.0991643454038997, + "grad_norm": 1.4970483779907227, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8847441673278809, + "num_tokens": 215704228.0, + "step": 5919 + }, + { + "epoch": 1.0993500464252552, + "grad_norm": 1.6223567724227905, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8734681010246277, + "num_tokens": 215738546.0, + "step": 5920 + }, + { + "epoch": 1.099535747446611, + "grad_norm": 1.4544380903244019, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8699337244033813, + "num_tokens": 215781498.0, + "step": 5921 + }, + { + "epoch": 1.0997214484679665, + "grad_norm": 1.3940984010696411, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8945648670196533, + "num_tokens": 215816730.0, + "step": 5922 + }, + { + "epoch": 1.0999071494893222, + "grad_norm": 1.6028668880462646, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8797006607055664, + "num_tokens": 215855297.0, + "step": 5923 + }, + { + "epoch": 1.1000928505106777, + "grad_norm": 1.5942633152008057, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8854727745056152, + "num_tokens": 215889548.0, + "step": 5924 + }, + { + "epoch": 1.1002785515320335, + "grad_norm": 1.530429720878601, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8835850358009338, + "num_tokens": 215926876.0, + "step": 5925 + }, + { + "epoch": 1.100464252553389, + "grad_norm": 1.6160486936569214, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8815662264823914, + "num_tokens": 215961180.0, + "step": 5926 + }, + { + "epoch": 1.1006499535747447, + "grad_norm": 1.541346549987793, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8863725662231445, + "num_tokens": 215994142.0, + "step": 5927 + }, + { + "epoch": 1.1008356545961002, + "grad_norm": 1.5167673826217651, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.876971960067749, + "num_tokens": 216030746.0, + "step": 5928 + }, + { + "epoch": 1.101021355617456, + "grad_norm": 1.403577208518982, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8789955377578735, + "num_tokens": 216071645.0, + "step": 5929 + }, + { + "epoch": 1.1012070566388115, + "grad_norm": 1.4634348154067993, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8826721906661987, + "num_tokens": 216110373.0, + "step": 5930 + }, + { + "epoch": 1.1013927576601672, + "grad_norm": 1.5452169179916382, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8873181939125061, + "num_tokens": 216148034.0, + "step": 5931 + }, + { + "epoch": 1.1015784586815227, + "grad_norm": 1.607761025428772, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8651968240737915, + "num_tokens": 216182638.0, + "step": 5932 + }, + { + "epoch": 1.1017641597028784, + "grad_norm": 1.5172561407089233, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8857001066207886, + "num_tokens": 216219064.0, + "step": 5933 + }, + { + "epoch": 1.101949860724234, + "grad_norm": 1.481671929359436, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8812042474746704, + "num_tokens": 216259217.0, + "step": 5934 + }, + { + "epoch": 1.1021355617455897, + "grad_norm": 1.5553570985794067, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8826934695243835, + "num_tokens": 216295993.0, + "step": 5935 + }, + { + "epoch": 1.1023212627669452, + "grad_norm": 1.6019189357757568, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8715893030166626, + "num_tokens": 216331682.0, + "step": 5936 + }, + { + "epoch": 1.102506963788301, + "grad_norm": 1.6292625665664673, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.88041090965271, + "num_tokens": 216364675.0, + "step": 5937 + }, + { + "epoch": 1.1026926648096564, + "grad_norm": 1.562787413597107, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8791016936302185, + "num_tokens": 216401516.0, + "step": 5938 + }, + { + "epoch": 1.1028783658310122, + "grad_norm": 1.465963363647461, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8869779109954834, + "num_tokens": 216441585.0, + "step": 5939 + }, + { + "epoch": 1.1030640668523677, + "grad_norm": 1.6444629430770874, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8579120635986328, + "num_tokens": 216478039.0, + "step": 5940 + }, + { + "epoch": 1.1032497678737232, + "grad_norm": 1.7695220708847046, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8766420483589172, + "num_tokens": 216506332.0, + "step": 5941 + }, + { + "epoch": 1.103435468895079, + "grad_norm": 1.4782694578170776, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8722841739654541, + "num_tokens": 216544491.0, + "step": 5942 + }, + { + "epoch": 1.1036211699164344, + "grad_norm": 1.4696192741394043, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8822873830795288, + "num_tokens": 216581631.0, + "step": 5943 + }, + { + "epoch": 1.1038068709377902, + "grad_norm": 1.4807425737380981, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8777849674224854, + "num_tokens": 216617838.0, + "step": 5944 + }, + { + "epoch": 1.1039925719591457, + "grad_norm": 1.5662205219268799, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8793265223503113, + "num_tokens": 216656182.0, + "step": 5945 + }, + { + "epoch": 1.1041782729805014, + "grad_norm": 1.492948293685913, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8844391107559204, + "num_tokens": 216693566.0, + "step": 5946 + }, + { + "epoch": 1.104363974001857, + "grad_norm": 1.617099404335022, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8616259694099426, + "num_tokens": 216731525.0, + "step": 5947 + }, + { + "epoch": 1.1045496750232127, + "grad_norm": 1.437913417816162, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8789344429969788, + "num_tokens": 216773851.0, + "step": 5948 + }, + { + "epoch": 1.1047353760445682, + "grad_norm": 1.5141973495483398, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.872818112373352, + "num_tokens": 216812616.0, + "step": 5949 + }, + { + "epoch": 1.104921077065924, + "grad_norm": 1.3793987035751343, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8849852085113525, + "num_tokens": 216854358.0, + "step": 5950 + }, + { + "epoch": 1.1051067780872794, + "grad_norm": 1.6104423999786377, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8553327918052673, + "num_tokens": 216892782.0, + "step": 5951 + }, + { + "epoch": 1.1052924791086352, + "grad_norm": 1.5020397901535034, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8727825284004211, + "num_tokens": 216932793.0, + "step": 5952 + }, + { + "epoch": 1.1054781801299907, + "grad_norm": 1.603672981262207, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8681628704071045, + "num_tokens": 216967523.0, + "step": 5953 + }, + { + "epoch": 1.1056638811513464, + "grad_norm": 1.6264225244522095, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8825138211250305, + "num_tokens": 217006961.0, + "step": 5954 + }, + { + "epoch": 1.105849582172702, + "grad_norm": 1.4975167512893677, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8745981454849243, + "num_tokens": 217045585.0, + "step": 5955 + }, + { + "epoch": 1.1060352831940576, + "grad_norm": 1.5012480020523071, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.881510853767395, + "num_tokens": 217081693.0, + "step": 5956 + }, + { + "epoch": 1.1062209842154132, + "grad_norm": 1.691975712776184, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8692222833633423, + "num_tokens": 217113338.0, + "step": 5957 + }, + { + "epoch": 1.1064066852367689, + "grad_norm": 1.5106337070465088, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8800337314605713, + "num_tokens": 217147342.0, + "step": 5958 + }, + { + "epoch": 1.1065923862581244, + "grad_norm": 1.5084573030471802, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8799141049385071, + "num_tokens": 217183908.0, + "step": 5959 + }, + { + "epoch": 1.1067780872794801, + "grad_norm": 1.9488540887832642, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.864564061164856, + "num_tokens": 217211934.0, + "step": 5960 + }, + { + "epoch": 1.1069637883008356, + "grad_norm": 1.5770606994628906, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8733490109443665, + "num_tokens": 217247639.0, + "step": 5961 + }, + { + "epoch": 1.1071494893221914, + "grad_norm": 1.4534093141555786, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8687183856964111, + "num_tokens": 217289239.0, + "step": 5962 + }, + { + "epoch": 1.1073351903435469, + "grad_norm": 1.622552514076233, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8745324611663818, + "num_tokens": 217324833.0, + "step": 5963 + }, + { + "epoch": 1.1075208913649024, + "grad_norm": 1.6589362621307373, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8655927181243896, + "num_tokens": 217357632.0, + "step": 5964 + }, + { + "epoch": 1.1077065923862581, + "grad_norm": 1.5793548822402954, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8743278980255127, + "num_tokens": 217396562.0, + "step": 5965 + }, + { + "epoch": 1.1078922934076139, + "grad_norm": 1.616446852684021, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8836941719055176, + "num_tokens": 217435359.0, + "step": 5966 + }, + { + "epoch": 1.1080779944289694, + "grad_norm": 1.4951616525650024, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8929644823074341, + "num_tokens": 217472466.0, + "step": 5967 + }, + { + "epoch": 1.1082636954503249, + "grad_norm": 1.5704838037490845, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.889643669128418, + "num_tokens": 217508713.0, + "step": 5968 + }, + { + "epoch": 1.1084493964716806, + "grad_norm": 1.4684308767318726, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8770010471343994, + "num_tokens": 217548335.0, + "step": 5969 + }, + { + "epoch": 1.1086350974930361, + "grad_norm": 1.5788428783416748, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8831325769424438, + "num_tokens": 217582281.0, + "step": 5970 + }, + { + "epoch": 1.1088207985143919, + "grad_norm": 1.549125075340271, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8842930793762207, + "num_tokens": 217618206.0, + "step": 5971 + }, + { + "epoch": 1.1090064995357474, + "grad_norm": 1.7636749744415283, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8763745427131653, + "num_tokens": 217649430.0, + "step": 5972 + }, + { + "epoch": 1.109192200557103, + "grad_norm": 1.499016523361206, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8825962543487549, + "num_tokens": 217685941.0, + "step": 5973 + }, + { + "epoch": 1.1093779015784586, + "grad_norm": 1.572885274887085, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8779005408287048, + "num_tokens": 217719976.0, + "step": 5974 + }, + { + "epoch": 1.1095636025998143, + "grad_norm": 1.5541374683380127, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8788868188858032, + "num_tokens": 217754012.0, + "step": 5975 + }, + { + "epoch": 1.1097493036211699, + "grad_norm": 1.560075044631958, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8765555620193481, + "num_tokens": 217788250.0, + "step": 5976 + }, + { + "epoch": 1.1099350046425256, + "grad_norm": 1.4958633184432983, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8771923184394836, + "num_tokens": 217826356.0, + "step": 5977 + }, + { + "epoch": 1.110120705663881, + "grad_norm": 1.4507149457931519, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8804086446762085, + "num_tokens": 217866878.0, + "step": 5978 + }, + { + "epoch": 1.1103064066852368, + "grad_norm": 1.636372447013855, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.866676926612854, + "num_tokens": 217903245.0, + "step": 5979 + }, + { + "epoch": 1.1104921077065923, + "grad_norm": 1.4838416576385498, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8880794048309326, + "num_tokens": 217936333.0, + "step": 5980 + }, + { + "epoch": 1.110677808727948, + "grad_norm": 1.4849075078964233, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8691998720169067, + "num_tokens": 217980405.0, + "step": 5981 + }, + { + "epoch": 1.1108635097493036, + "grad_norm": 1.440990924835205, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8787750005722046, + "num_tokens": 218022581.0, + "step": 5982 + }, + { + "epoch": 1.1110492107706593, + "grad_norm": 1.4568631649017334, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8882414102554321, + "num_tokens": 218059544.0, + "step": 5983 + }, + { + "epoch": 1.1112349117920148, + "grad_norm": 1.55134916305542, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.88918137550354, + "num_tokens": 218095142.0, + "step": 5984 + }, + { + "epoch": 1.1114206128133706, + "grad_norm": 1.6093471050262451, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8798834085464478, + "num_tokens": 218132424.0, + "step": 5985 + }, + { + "epoch": 1.111606313834726, + "grad_norm": 1.5535671710968018, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.884563684463501, + "num_tokens": 218167373.0, + "step": 5986 + }, + { + "epoch": 1.1117920148560818, + "grad_norm": 1.548594355583191, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8954677581787109, + "num_tokens": 218203575.0, + "step": 5987 + }, + { + "epoch": 1.1119777158774373, + "grad_norm": 1.6871366500854492, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.856034517288208, + "num_tokens": 218240078.0, + "step": 5988 + }, + { + "epoch": 1.112163416898793, + "grad_norm": 1.4712026119232178, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8821456432342529, + "num_tokens": 218279669.0, + "step": 5989 + }, + { + "epoch": 1.1123491179201486, + "grad_norm": 1.5986924171447754, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8712729215621948, + "num_tokens": 218315029.0, + "step": 5990 + }, + { + "epoch": 1.112534818941504, + "grad_norm": 1.5768595933914185, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8626530170440674, + "num_tokens": 218355458.0, + "step": 5991 + }, + { + "epoch": 1.1127205199628598, + "grad_norm": 1.3810534477233887, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8810451030731201, + "num_tokens": 218396247.0, + "step": 5992 + }, + { + "epoch": 1.1129062209842153, + "grad_norm": 1.551375389099121, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8775647878646851, + "num_tokens": 218438354.0, + "step": 5993 + }, + { + "epoch": 1.113091922005571, + "grad_norm": 1.771003246307373, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8659011721611023, + "num_tokens": 218470139.0, + "step": 5994 + }, + { + "epoch": 1.1132776230269266, + "grad_norm": 1.522632122039795, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8783326148986816, + "num_tokens": 218509179.0, + "step": 5995 + }, + { + "epoch": 1.1134633240482823, + "grad_norm": 1.530486822128296, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8591979742050171, + "num_tokens": 218552283.0, + "step": 5996 + }, + { + "epoch": 1.1136490250696378, + "grad_norm": 1.5120614767074585, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8880597352981567, + "num_tokens": 218590495.0, + "step": 5997 + }, + { + "epoch": 1.1138347260909935, + "grad_norm": 1.6215301752090454, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8777813911437988, + "num_tokens": 218625125.0, + "step": 5998 + }, + { + "epoch": 1.114020427112349, + "grad_norm": 1.526029348373413, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.860305666923523, + "num_tokens": 218667464.0, + "step": 5999 + }, + { + "epoch": 1.1142061281337048, + "grad_norm": 1.5453349351882935, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8841757774353027, + "num_tokens": 218702327.0, + "step": 6000 + }, + { + "epoch": 1.1143918291550603, + "grad_norm": 1.681056261062622, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8539150357246399, + "num_tokens": 218741747.0, + "step": 6001 + }, + { + "epoch": 1.114577530176416, + "grad_norm": 1.630088210105896, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8651586771011353, + "num_tokens": 218777009.0, + "step": 6002 + }, + { + "epoch": 1.1147632311977715, + "grad_norm": 1.4991145133972168, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8803978562355042, + "num_tokens": 218814777.0, + "step": 6003 + }, + { + "epoch": 1.1149489322191273, + "grad_norm": 1.5598515272140503, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8752177953720093, + "num_tokens": 218852424.0, + "step": 6004 + }, + { + "epoch": 1.1151346332404828, + "grad_norm": 1.493890404701233, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8849847316741943, + "num_tokens": 218890291.0, + "step": 6005 + }, + { + "epoch": 1.1153203342618385, + "grad_norm": 1.6066551208496094, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8624874949455261, + "num_tokens": 218926716.0, + "step": 6006 + }, + { + "epoch": 1.115506035283194, + "grad_norm": 1.463287115097046, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8697963953018188, + "num_tokens": 218966542.0, + "step": 6007 + }, + { + "epoch": 1.1156917363045498, + "grad_norm": 1.5806728601455688, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8797396421432495, + "num_tokens": 219002553.0, + "step": 6008 + }, + { + "epoch": 1.1158774373259053, + "grad_norm": 1.5730751752853394, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8739127516746521, + "num_tokens": 219039053.0, + "step": 6009 + }, + { + "epoch": 1.116063138347261, + "grad_norm": 1.4107365608215332, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8838421702384949, + "num_tokens": 219077308.0, + "step": 6010 + }, + { + "epoch": 1.1162488393686165, + "grad_norm": 1.4791550636291504, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.866600751876831, + "num_tokens": 219118588.0, + "step": 6011 + }, + { + "epoch": 1.1164345403899723, + "grad_norm": 1.4964983463287354, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8887032270431519, + "num_tokens": 219154890.0, + "step": 6012 + }, + { + "epoch": 1.1166202414113278, + "grad_norm": 1.6039657592773438, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8691489100456238, + "num_tokens": 219193517.0, + "step": 6013 + }, + { + "epoch": 1.1168059424326833, + "grad_norm": 1.6231677532196045, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8716605305671692, + "num_tokens": 219229038.0, + "step": 6014 + }, + { + "epoch": 1.116991643454039, + "grad_norm": 1.6825393438339233, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8768563866615295, + "num_tokens": 219260870.0, + "step": 6015 + }, + { + "epoch": 1.1171773444753945, + "grad_norm": 1.5004884004592896, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8758065700531006, + "num_tokens": 219299547.0, + "step": 6016 + }, + { + "epoch": 1.1173630454967503, + "grad_norm": 1.5893000364303589, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8852285146713257, + "num_tokens": 219335557.0, + "step": 6017 + }, + { + "epoch": 1.1175487465181058, + "grad_norm": 1.6821587085723877, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8819586038589478, + "num_tokens": 219367064.0, + "step": 6018 + }, + { + "epoch": 1.1177344475394615, + "grad_norm": 1.6748957633972168, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8763281106948853, + "num_tokens": 219399034.0, + "step": 6019 + }, + { + "epoch": 1.117920148560817, + "grad_norm": 1.5071322917938232, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8682979941368103, + "num_tokens": 219439272.0, + "step": 6020 + }, + { + "epoch": 1.1181058495821727, + "grad_norm": 1.6718826293945312, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8753150105476379, + "num_tokens": 219474411.0, + "step": 6021 + }, + { + "epoch": 1.1182915506035283, + "grad_norm": 1.5634534358978271, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8675799369812012, + "num_tokens": 219512171.0, + "step": 6022 + }, + { + "epoch": 1.118477251624884, + "grad_norm": 1.6306278705596924, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8896833062171936, + "num_tokens": 219544087.0, + "step": 6023 + }, + { + "epoch": 1.1186629526462395, + "grad_norm": 1.534214735031128, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.891302227973938, + "num_tokens": 219578282.0, + "step": 6024 + }, + { + "epoch": 1.1188486536675952, + "grad_norm": 1.7141246795654297, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8591362237930298, + "num_tokens": 219613127.0, + "step": 6025 + }, + { + "epoch": 1.1190343546889507, + "grad_norm": 1.548832893371582, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.876175045967102, + "num_tokens": 219649174.0, + "step": 6026 + }, + { + "epoch": 1.1192200557103065, + "grad_norm": 1.64693021774292, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8672738671302795, + "num_tokens": 219686323.0, + "step": 6027 + }, + { + "epoch": 1.119405756731662, + "grad_norm": 1.5732828378677368, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8847976326942444, + "num_tokens": 219720612.0, + "step": 6028 + }, + { + "epoch": 1.1195914577530177, + "grad_norm": 1.640310525894165, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8583351969718933, + "num_tokens": 219761441.0, + "step": 6029 + }, + { + "epoch": 1.1197771587743732, + "grad_norm": 1.5083988904953003, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8752644062042236, + "num_tokens": 219798628.0, + "step": 6030 + }, + { + "epoch": 1.119962859795729, + "grad_norm": 1.446183681488037, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.88564133644104, + "num_tokens": 219834970.0, + "step": 6031 + }, + { + "epoch": 1.1201485608170845, + "grad_norm": 1.4138215780258179, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8700243234634399, + "num_tokens": 219879410.0, + "step": 6032 + }, + { + "epoch": 1.1203342618384402, + "grad_norm": 1.6116822957992554, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8569499850273132, + "num_tokens": 219917525.0, + "step": 6033 + }, + { + "epoch": 1.1205199628597957, + "grad_norm": 1.488417387008667, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8782229423522949, + "num_tokens": 219953176.0, + "step": 6034 + }, + { + "epoch": 1.1207056638811514, + "grad_norm": 1.7178739309310913, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8862882256507874, + "num_tokens": 219983653.0, + "step": 6035 + }, + { + "epoch": 1.120891364902507, + "grad_norm": 1.5910395383834839, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8919864892959595, + "num_tokens": 220016214.0, + "step": 6036 + }, + { + "epoch": 1.1210770659238625, + "grad_norm": 1.488047480583191, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.88636314868927, + "num_tokens": 220054515.0, + "step": 6037 + }, + { + "epoch": 1.1212627669452182, + "grad_norm": 1.7451121807098389, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8564617037773132, + "num_tokens": 220087429.0, + "step": 6038 + }, + { + "epoch": 1.1214484679665737, + "grad_norm": 1.5777839422225952, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8677050471305847, + "num_tokens": 220122155.0, + "step": 6039 + }, + { + "epoch": 1.1216341689879294, + "grad_norm": 1.506829023361206, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8566040396690369, + "num_tokens": 220165233.0, + "step": 6040 + }, + { + "epoch": 1.121819870009285, + "grad_norm": 1.6736540794372559, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8621839284896851, + "num_tokens": 220204556.0, + "step": 6041 + }, + { + "epoch": 1.1220055710306407, + "grad_norm": 1.5163438320159912, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8838255405426025, + "num_tokens": 220240889.0, + "step": 6042 + }, + { + "epoch": 1.1221912720519962, + "grad_norm": 1.478044867515564, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8670058846473694, + "num_tokens": 220284919.0, + "step": 6043 + }, + { + "epoch": 1.122376973073352, + "grad_norm": 1.606389045715332, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8792135119438171, + "num_tokens": 220318896.0, + "step": 6044 + }, + { + "epoch": 1.1225626740947074, + "grad_norm": 1.5437177419662476, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8811825513839722, + "num_tokens": 220356070.0, + "step": 6045 + }, + { + "epoch": 1.1227483751160632, + "grad_norm": 1.4219536781311035, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8911451697349548, + "num_tokens": 220395954.0, + "step": 6046 + }, + { + "epoch": 1.1229340761374187, + "grad_norm": 1.5234146118164062, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8808014392852783, + "num_tokens": 220431349.0, + "step": 6047 + }, + { + "epoch": 1.1231197771587744, + "grad_norm": 1.6153918504714966, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.880912184715271, + "num_tokens": 220461307.0, + "step": 6048 + }, + { + "epoch": 1.12330547818013, + "grad_norm": 1.6393169164657593, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8613693714141846, + "num_tokens": 220496281.0, + "step": 6049 + }, + { + "epoch": 1.1234911792014857, + "grad_norm": 1.5759823322296143, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8599916696548462, + "num_tokens": 220531530.0, + "step": 6050 + }, + { + "epoch": 1.1236768802228412, + "grad_norm": 1.4820064306259155, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8868447542190552, + "num_tokens": 220569968.0, + "step": 6051 + }, + { + "epoch": 1.123862581244197, + "grad_norm": 1.5488104820251465, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8698311448097229, + "num_tokens": 220607675.0, + "step": 6052 + }, + { + "epoch": 1.1240482822655524, + "grad_norm": 1.5800381898880005, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8765145540237427, + "num_tokens": 220644420.0, + "step": 6053 + }, + { + "epoch": 1.1242339832869082, + "grad_norm": 1.665202260017395, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.879876971244812, + "num_tokens": 220674913.0, + "step": 6054 + }, + { + "epoch": 1.1244196843082637, + "grad_norm": 1.6530826091766357, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8640978336334229, + "num_tokens": 220708648.0, + "step": 6055 + }, + { + "epoch": 1.1246053853296194, + "grad_norm": 1.564497947692871, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.866538405418396, + "num_tokens": 220743610.0, + "step": 6056 + }, + { + "epoch": 1.124791086350975, + "grad_norm": 1.6848485469818115, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8708145022392273, + "num_tokens": 220777422.0, + "step": 6057 + }, + { + "epoch": 1.1249767873723306, + "grad_norm": 1.5264302492141724, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8806995153427124, + "num_tokens": 220814397.0, + "step": 6058 + }, + { + "epoch": 1.1251624883936862, + "grad_norm": 1.703271746635437, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8842061758041382, + "num_tokens": 220845319.0, + "step": 6059 + }, + { + "epoch": 1.1253481894150417, + "grad_norm": 1.7797507047653198, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8785513043403625, + "num_tokens": 220876388.0, + "step": 6060 + }, + { + "epoch": 1.1255338904363974, + "grad_norm": 1.7740538120269775, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8774484395980835, + "num_tokens": 220907508.0, + "step": 6061 + }, + { + "epoch": 1.1257195914577531, + "grad_norm": 1.4569711685180664, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8847090005874634, + "num_tokens": 220946398.0, + "step": 6062 + }, + { + "epoch": 1.1259052924791086, + "grad_norm": 1.4720945358276367, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8753424882888794, + "num_tokens": 220987580.0, + "step": 6063 + }, + { + "epoch": 1.1260909935004642, + "grad_norm": 1.7389079332351685, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8775773048400879, + "num_tokens": 221019901.0, + "step": 6064 + }, + { + "epoch": 1.1262766945218199, + "grad_norm": 1.661582589149475, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8805627822875977, + "num_tokens": 221052160.0, + "step": 6065 + }, + { + "epoch": 1.1264623955431754, + "grad_norm": 1.5931544303894043, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8756861090660095, + "num_tokens": 221088011.0, + "step": 6066 + }, + { + "epoch": 1.1266480965645311, + "grad_norm": 1.6760890483856201, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8766829967498779, + "num_tokens": 221121918.0, + "step": 6067 + }, + { + "epoch": 1.1268337975858866, + "grad_norm": 1.6254098415374756, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8662146925926208, + "num_tokens": 221159351.0, + "step": 6068 + }, + { + "epoch": 1.1270194986072424, + "grad_norm": 1.4517618417739868, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8604928851127625, + "num_tokens": 221201179.0, + "step": 6069 + }, + { + "epoch": 1.1272051996285979, + "grad_norm": 1.5882130861282349, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8698803186416626, + "num_tokens": 221239270.0, + "step": 6070 + }, + { + "epoch": 1.1273909006499536, + "grad_norm": 1.5712511539459229, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8727899789810181, + "num_tokens": 221279158.0, + "step": 6071 + }, + { + "epoch": 1.1275766016713091, + "grad_norm": 1.733556866645813, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8545747995376587, + "num_tokens": 221312984.0, + "step": 6072 + }, + { + "epoch": 1.1277623026926649, + "grad_norm": 1.5554720163345337, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8801056146621704, + "num_tokens": 221351007.0, + "step": 6073 + }, + { + "epoch": 1.1279480037140204, + "grad_norm": 1.6997885704040527, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8771697282791138, + "num_tokens": 221381140.0, + "step": 6074 + }, + { + "epoch": 1.128133704735376, + "grad_norm": 1.5806366205215454, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8702768087387085, + "num_tokens": 221417042.0, + "step": 6075 + }, + { + "epoch": 1.1283194057567316, + "grad_norm": 1.6244592666625977, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8802579641342163, + "num_tokens": 221449195.0, + "step": 6076 + }, + { + "epoch": 1.1285051067780874, + "grad_norm": 1.5172513723373413, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8684645295143127, + "num_tokens": 221489311.0, + "step": 6077 + }, + { + "epoch": 1.1286908077994429, + "grad_norm": 1.5550562143325806, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8701226711273193, + "num_tokens": 221530917.0, + "step": 6078 + }, + { + "epoch": 1.1288765088207986, + "grad_norm": 1.4803136587142944, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8818801045417786, + "num_tokens": 221570799.0, + "step": 6079 + }, + { + "epoch": 1.129062209842154, + "grad_norm": 1.5660526752471924, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8632765412330627, + "num_tokens": 221608897.0, + "step": 6080 + }, + { + "epoch": 1.1292479108635098, + "grad_norm": 1.4951666593551636, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8616291284561157, + "num_tokens": 221652288.0, + "step": 6081 + }, + { + "epoch": 1.1294336118848654, + "grad_norm": 1.3670166730880737, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8901000618934631, + "num_tokens": 221695367.0, + "step": 6082 + }, + { + "epoch": 1.1296193129062209, + "grad_norm": 1.6934714317321777, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8834234476089478, + "num_tokens": 221727640.0, + "step": 6083 + }, + { + "epoch": 1.1298050139275766, + "grad_norm": 1.5385249853134155, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8707387447357178, + "num_tokens": 221765421.0, + "step": 6084 + }, + { + "epoch": 1.1299907149489323, + "grad_norm": 1.5651848316192627, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8514025211334229, + "num_tokens": 221803044.0, + "step": 6085 + }, + { + "epoch": 1.1301764159702878, + "grad_norm": 1.695230484008789, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.886738121509552, + "num_tokens": 221834150.0, + "step": 6086 + }, + { + "epoch": 1.1303621169916434, + "grad_norm": 1.6182482242584229, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8577530980110168, + "num_tokens": 221875290.0, + "step": 6087 + }, + { + "epoch": 1.130547818012999, + "grad_norm": 1.5904861688613892, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8784102201461792, + "num_tokens": 221910970.0, + "step": 6088 + }, + { + "epoch": 1.1307335190343546, + "grad_norm": 1.4993687868118286, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8668043613433838, + "num_tokens": 221952778.0, + "step": 6089 + }, + { + "epoch": 1.1309192200557103, + "grad_norm": 1.4516701698303223, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8931549787521362, + "num_tokens": 221988580.0, + "step": 6090 + }, + { + "epoch": 1.1311049210770658, + "grad_norm": 1.5902884006500244, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8564438223838806, + "num_tokens": 222027277.0, + "step": 6091 + }, + { + "epoch": 1.1312906220984216, + "grad_norm": 1.6137880086898804, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8775842785835266, + "num_tokens": 222064372.0, + "step": 6092 + }, + { + "epoch": 1.131476323119777, + "grad_norm": 1.6218898296356201, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8835492730140686, + "num_tokens": 222098276.0, + "step": 6093 + }, + { + "epoch": 1.1316620241411328, + "grad_norm": 1.4018174409866333, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.882117509841919, + "num_tokens": 222138079.0, + "step": 6094 + }, + { + "epoch": 1.1318477251624883, + "grad_norm": 1.4799977540969849, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8732335567474365, + "num_tokens": 222179441.0, + "step": 6095 + }, + { + "epoch": 1.132033426183844, + "grad_norm": 1.55389404296875, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8734602928161621, + "num_tokens": 222216607.0, + "step": 6096 + }, + { + "epoch": 1.1322191272051996, + "grad_norm": 1.532801866531372, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8847000598907471, + "num_tokens": 222252339.0, + "step": 6097 + }, + { + "epoch": 1.1324048282265553, + "grad_norm": 1.5175384283065796, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8648021817207336, + "num_tokens": 222297701.0, + "step": 6098 + }, + { + "epoch": 1.1325905292479108, + "grad_norm": 1.67741858959198, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8889846205711365, + "num_tokens": 222328941.0, + "step": 6099 + }, + { + "epoch": 1.1327762302692665, + "grad_norm": 1.649090051651001, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8861922025680542, + "num_tokens": 222366238.0, + "step": 6100 + }, + { + "epoch": 1.132961931290622, + "grad_norm": 1.590821623802185, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8636070489883423, + "num_tokens": 222400516.0, + "step": 6101 + }, + { + "epoch": 1.1331476323119778, + "grad_norm": 1.6690677404403687, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8687056303024292, + "num_tokens": 222435075.0, + "step": 6102 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.4868767261505127, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8642269372940063, + "num_tokens": 222475615.0, + "step": 6103 + }, + { + "epoch": 1.133519034354689, + "grad_norm": 1.5682162046432495, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8718204498291016, + "num_tokens": 222510131.0, + "step": 6104 + }, + { + "epoch": 1.1337047353760445, + "grad_norm": 1.391918659210205, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.883412778377533, + "num_tokens": 222554158.0, + "step": 6105 + }, + { + "epoch": 1.1338904363974003, + "grad_norm": 1.5036851167678833, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8664945363998413, + "num_tokens": 222595133.0, + "step": 6106 + }, + { + "epoch": 1.1340761374187558, + "grad_norm": 1.5439642667770386, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8778471350669861, + "num_tokens": 222630422.0, + "step": 6107 + }, + { + "epoch": 1.1342618384401115, + "grad_norm": 1.517767310142517, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8704519271850586, + "num_tokens": 222667772.0, + "step": 6108 + }, + { + "epoch": 1.134447539461467, + "grad_norm": 1.504014015197754, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8758633732795715, + "num_tokens": 222708532.0, + "step": 6109 + }, + { + "epoch": 1.1346332404828225, + "grad_norm": 1.5894566774368286, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8760286569595337, + "num_tokens": 222743022.0, + "step": 6110 + }, + { + "epoch": 1.1348189415041783, + "grad_norm": 1.4887956380844116, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8805326223373413, + "num_tokens": 222780183.0, + "step": 6111 + }, + { + "epoch": 1.135004642525534, + "grad_norm": 1.4921526908874512, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8819605708122253, + "num_tokens": 222815011.0, + "step": 6112 + }, + { + "epoch": 1.1351903435468895, + "grad_norm": 1.4935799837112427, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8758785724639893, + "num_tokens": 222852725.0, + "step": 6113 + }, + { + "epoch": 1.135376044568245, + "grad_norm": 1.4981228113174438, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8838116526603699, + "num_tokens": 222888178.0, + "step": 6114 + }, + { + "epoch": 1.1355617455896008, + "grad_norm": 1.4796799421310425, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8760898113250732, + "num_tokens": 222929002.0, + "step": 6115 + }, + { + "epoch": 1.1357474466109563, + "grad_norm": 1.4809279441833496, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8829047679901123, + "num_tokens": 222968903.0, + "step": 6116 + }, + { + "epoch": 1.135933147632312, + "grad_norm": 1.6570357084274292, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.86822909116745, + "num_tokens": 222999421.0, + "step": 6117 + }, + { + "epoch": 1.1361188486536675, + "grad_norm": 1.5697640180587769, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8751984238624573, + "num_tokens": 223038169.0, + "step": 6118 + }, + { + "epoch": 1.1363045496750233, + "grad_norm": 1.5170053243637085, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8861587047576904, + "num_tokens": 223073213.0, + "step": 6119 + }, + { + "epoch": 1.1364902506963788, + "grad_norm": 1.6115120649337769, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8751468658447266, + "num_tokens": 223107188.0, + "step": 6120 + }, + { + "epoch": 1.1366759517177345, + "grad_norm": 1.447006344795227, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8734453916549683, + "num_tokens": 223151403.0, + "step": 6121 + }, + { + "epoch": 1.13686165273909, + "grad_norm": 1.7598774433135986, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.862533688545227, + "num_tokens": 223184701.0, + "step": 6122 + }, + { + "epoch": 1.1370473537604457, + "grad_norm": 1.4414165019989014, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8900686502456665, + "num_tokens": 223221576.0, + "step": 6123 + }, + { + "epoch": 1.1372330547818013, + "grad_norm": 1.5592782497406006, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8659254312515259, + "num_tokens": 223261048.0, + "step": 6124 + }, + { + "epoch": 1.137418755803157, + "grad_norm": 1.6097229719161987, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8779209852218628, + "num_tokens": 223301546.0, + "step": 6125 + }, + { + "epoch": 1.1376044568245125, + "grad_norm": 1.683661699295044, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8890812397003174, + "num_tokens": 223337935.0, + "step": 6126 + }, + { + "epoch": 1.1377901578458682, + "grad_norm": 1.684065580368042, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8660317659378052, + "num_tokens": 223370964.0, + "step": 6127 + }, + { + "epoch": 1.1379758588672237, + "grad_norm": 1.6469801664352417, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8660687208175659, + "num_tokens": 223408200.0, + "step": 6128 + }, + { + "epoch": 1.1381615598885795, + "grad_norm": 1.5422465801239014, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8859641551971436, + "num_tokens": 223445290.0, + "step": 6129 + }, + { + "epoch": 1.138347260909935, + "grad_norm": 1.5034345388412476, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.871085524559021, + "num_tokens": 223481874.0, + "step": 6130 + }, + { + "epoch": 1.1385329619312907, + "grad_norm": 1.5276175737380981, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8963779211044312, + "num_tokens": 223515155.0, + "step": 6131 + }, + { + "epoch": 1.1387186629526462, + "grad_norm": 1.6640249490737915, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.896622896194458, + "num_tokens": 223544004.0, + "step": 6132 + }, + { + "epoch": 1.1389043639740017, + "grad_norm": 1.4902092218399048, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8707218766212463, + "num_tokens": 223581410.0, + "step": 6133 + }, + { + "epoch": 1.1390900649953575, + "grad_norm": 1.6363565921783447, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8920618295669556, + "num_tokens": 223614263.0, + "step": 6134 + }, + { + "epoch": 1.1392757660167132, + "grad_norm": 1.6010384559631348, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8673760294914246, + "num_tokens": 223653088.0, + "step": 6135 + }, + { + "epoch": 1.1394614670380687, + "grad_norm": 1.5981701612472534, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8750498294830322, + "num_tokens": 223689613.0, + "step": 6136 + }, + { + "epoch": 1.1396471680594242, + "grad_norm": 1.5305254459381104, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.873439610004425, + "num_tokens": 223728475.0, + "step": 6137 + }, + { + "epoch": 1.13983286908078, + "grad_norm": 1.7753262519836426, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8441419005393982, + "num_tokens": 223763688.0, + "step": 6138 + }, + { + "epoch": 1.1400185701021355, + "grad_norm": 1.6312984228134155, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8724873661994934, + "num_tokens": 223799158.0, + "step": 6139 + }, + { + "epoch": 1.1402042711234912, + "grad_norm": 1.5883190631866455, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8740530014038086, + "num_tokens": 223835681.0, + "step": 6140 + }, + { + "epoch": 1.1403899721448467, + "grad_norm": 1.3964868783950806, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8970780372619629, + "num_tokens": 223873066.0, + "step": 6141 + }, + { + "epoch": 1.1405756731662025, + "grad_norm": 1.481905460357666, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8781178593635559, + "num_tokens": 223912636.0, + "step": 6142 + }, + { + "epoch": 1.140761374187558, + "grad_norm": 1.6454787254333496, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8873423337936401, + "num_tokens": 223945378.0, + "step": 6143 + }, + { + "epoch": 1.1409470752089137, + "grad_norm": 1.2960866689682007, + "learning_rate": 1e-06, + "loss": 0.2741, + "mean_token_accuracy": 0.9028248190879822, + "num_tokens": 223988152.0, + "step": 6144 + }, + { + "epoch": 1.1411327762302692, + "grad_norm": 1.4350650310516357, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8796179294586182, + "num_tokens": 224030904.0, + "step": 6145 + }, + { + "epoch": 1.141318477251625, + "grad_norm": 1.6162439584732056, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8751639127731323, + "num_tokens": 224063667.0, + "step": 6146 + }, + { + "epoch": 1.1415041782729805, + "grad_norm": 1.6274625062942505, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8746905326843262, + "num_tokens": 224099010.0, + "step": 6147 + }, + { + "epoch": 1.1416898792943362, + "grad_norm": 1.4926040172576904, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8765500783920288, + "num_tokens": 224138930.0, + "step": 6148 + }, + { + "epoch": 1.1418755803156917, + "grad_norm": 1.6018935441970825, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8618069291114807, + "num_tokens": 224177216.0, + "step": 6149 + }, + { + "epoch": 1.1420612813370474, + "grad_norm": 1.6545614004135132, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8722473382949829, + "num_tokens": 224212587.0, + "step": 6150 + }, + { + "epoch": 1.142246982358403, + "grad_norm": 1.4465322494506836, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8877452611923218, + "num_tokens": 224252338.0, + "step": 6151 + }, + { + "epoch": 1.1424326833797587, + "grad_norm": 1.723678469657898, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8864048719406128, + "num_tokens": 224285523.0, + "step": 6152 + }, + { + "epoch": 1.1426183844011142, + "grad_norm": 1.7023661136627197, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.868992030620575, + "num_tokens": 224320621.0, + "step": 6153 + }, + { + "epoch": 1.14280408542247, + "grad_norm": 1.571642518043518, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8698490262031555, + "num_tokens": 224355539.0, + "step": 6154 + }, + { + "epoch": 1.1429897864438254, + "grad_norm": 1.5906922817230225, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8760607242584229, + "num_tokens": 224391863.0, + "step": 6155 + }, + { + "epoch": 1.143175487465181, + "grad_norm": 1.6404815912246704, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8781588673591614, + "num_tokens": 224425747.0, + "step": 6156 + }, + { + "epoch": 1.1433611884865367, + "grad_norm": 1.6508405208587646, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8671425580978394, + "num_tokens": 224460151.0, + "step": 6157 + }, + { + "epoch": 1.1435468895078924, + "grad_norm": 1.6644059419631958, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.871846616268158, + "num_tokens": 224492677.0, + "step": 6158 + }, + { + "epoch": 1.143732590529248, + "grad_norm": 1.457306981086731, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8717929124832153, + "num_tokens": 224534098.0, + "step": 6159 + }, + { + "epoch": 1.1439182915506034, + "grad_norm": 1.7829004526138306, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8722043037414551, + "num_tokens": 224564472.0, + "step": 6160 + }, + { + "epoch": 1.1441039925719592, + "grad_norm": 1.4859002828598022, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.862945556640625, + "num_tokens": 224608635.0, + "step": 6161 + }, + { + "epoch": 1.1442896935933147, + "grad_norm": 1.5142580270767212, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.887204647064209, + "num_tokens": 224645268.0, + "step": 6162 + }, + { + "epoch": 1.1444753946146704, + "grad_norm": 1.6808099746704102, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8719849586486816, + "num_tokens": 224678725.0, + "step": 6163 + }, + { + "epoch": 1.144661095636026, + "grad_norm": 1.5506869554519653, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8676106333732605, + "num_tokens": 224715916.0, + "step": 6164 + }, + { + "epoch": 1.1448467966573816, + "grad_norm": 1.5976685285568237, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8650876879692078, + "num_tokens": 224753890.0, + "step": 6165 + }, + { + "epoch": 1.1450324976787372, + "grad_norm": 1.5073864459991455, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8812521696090698, + "num_tokens": 224790064.0, + "step": 6166 + }, + { + "epoch": 1.145218198700093, + "grad_norm": 1.5534709692001343, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8760274648666382, + "num_tokens": 224825027.0, + "step": 6167 + }, + { + "epoch": 1.1454038997214484, + "grad_norm": 1.470379114151001, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8713787794113159, + "num_tokens": 224864072.0, + "step": 6168 + }, + { + "epoch": 1.1455896007428041, + "grad_norm": 1.5625617504119873, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8743423223495483, + "num_tokens": 224901911.0, + "step": 6169 + }, + { + "epoch": 1.1457753017641596, + "grad_norm": 1.6953662633895874, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8920368552207947, + "num_tokens": 224932067.0, + "step": 6170 + }, + { + "epoch": 1.1459610027855154, + "grad_norm": 1.6013398170471191, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8664632439613342, + "num_tokens": 224971121.0, + "step": 6171 + }, + { + "epoch": 1.146146703806871, + "grad_norm": 1.507267713546753, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8914362192153931, + "num_tokens": 225007797.0, + "step": 6172 + }, + { + "epoch": 1.1463324048282266, + "grad_norm": 1.505208969116211, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8769481182098389, + "num_tokens": 225048531.0, + "step": 6173 + }, + { + "epoch": 1.1465181058495821, + "grad_norm": 1.5702605247497559, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8543712496757507, + "num_tokens": 225087956.0, + "step": 6174 + }, + { + "epoch": 1.1467038068709379, + "grad_norm": 1.5924001932144165, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8869771361351013, + "num_tokens": 225120708.0, + "step": 6175 + }, + { + "epoch": 1.1468895078922934, + "grad_norm": 1.5068249702453613, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8796176910400391, + "num_tokens": 225158950.0, + "step": 6176 + }, + { + "epoch": 1.1470752089136491, + "grad_norm": 1.548815369606018, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8664547204971313, + "num_tokens": 225194988.0, + "step": 6177 + }, + { + "epoch": 1.1472609099350046, + "grad_norm": 1.569724202156067, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8689870834350586, + "num_tokens": 225233724.0, + "step": 6178 + }, + { + "epoch": 1.1474466109563601, + "grad_norm": 1.6300861835479736, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8691263198852539, + "num_tokens": 225268443.0, + "step": 6179 + }, + { + "epoch": 1.1476323119777159, + "grad_norm": 1.7171097993850708, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8601224422454834, + "num_tokens": 225298644.0, + "step": 6180 + }, + { + "epoch": 1.1478180129990716, + "grad_norm": 1.5454013347625732, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.860602617263794, + "num_tokens": 225337113.0, + "step": 6181 + }, + { + "epoch": 1.1480037140204271, + "grad_norm": 1.5258207321166992, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8761355876922607, + "num_tokens": 225373401.0, + "step": 6182 + }, + { + "epoch": 1.1481894150417826, + "grad_norm": 1.6532877683639526, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8664164543151855, + "num_tokens": 225409773.0, + "step": 6183 + }, + { + "epoch": 1.1483751160631384, + "grad_norm": 1.5408365726470947, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8862109184265137, + "num_tokens": 225445047.0, + "step": 6184 + }, + { + "epoch": 1.1485608170844939, + "grad_norm": 1.4732621908187866, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8771704435348511, + "num_tokens": 225485701.0, + "step": 6185 + }, + { + "epoch": 1.1487465181058496, + "grad_norm": 1.7515897750854492, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8659950494766235, + "num_tokens": 225519478.0, + "step": 6186 + }, + { + "epoch": 1.1489322191272051, + "grad_norm": 1.5250380039215088, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.871676504611969, + "num_tokens": 225556453.0, + "step": 6187 + }, + { + "epoch": 1.1491179201485608, + "grad_norm": 1.4205713272094727, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8772872090339661, + "num_tokens": 225596757.0, + "step": 6188 + }, + { + "epoch": 1.1493036211699164, + "grad_norm": 1.8910768032073975, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8716464042663574, + "num_tokens": 225627963.0, + "step": 6189 + }, + { + "epoch": 1.149489322191272, + "grad_norm": 1.5799891948699951, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8667024374008179, + "num_tokens": 225663717.0, + "step": 6190 + }, + { + "epoch": 1.1496750232126276, + "grad_norm": 1.5373666286468506, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8734005689620972, + "num_tokens": 225703398.0, + "step": 6191 + }, + { + "epoch": 1.1498607242339833, + "grad_norm": 1.5452628135681152, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8631680011749268, + "num_tokens": 225743119.0, + "step": 6192 + }, + { + "epoch": 1.1500464252553388, + "grad_norm": 1.5820553302764893, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8757718801498413, + "num_tokens": 225778842.0, + "step": 6193 + }, + { + "epoch": 1.1502321262766946, + "grad_norm": 1.5431995391845703, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8726125955581665, + "num_tokens": 225816684.0, + "step": 6194 + }, + { + "epoch": 1.15041782729805, + "grad_norm": 1.6237927675247192, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8871651291847229, + "num_tokens": 225848527.0, + "step": 6195 + }, + { + "epoch": 1.1506035283194058, + "grad_norm": 1.497739315032959, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8774358034133911, + "num_tokens": 225887823.0, + "step": 6196 + }, + { + "epoch": 1.1507892293407613, + "grad_norm": 1.607519268989563, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.884609580039978, + "num_tokens": 225919064.0, + "step": 6197 + }, + { + "epoch": 1.150974930362117, + "grad_norm": 1.5164237022399902, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8761419057846069, + "num_tokens": 225954155.0, + "step": 6198 + }, + { + "epoch": 1.1511606313834726, + "grad_norm": 1.6414707899093628, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8694536685943604, + "num_tokens": 225991363.0, + "step": 6199 + }, + { + "epoch": 1.1513463324048283, + "grad_norm": 1.6533114910125732, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8848886489868164, + "num_tokens": 226023382.0, + "step": 6200 + }, + { + "epoch": 1.1515320334261838, + "grad_norm": 1.5438560247421265, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8754233717918396, + "num_tokens": 226061703.0, + "step": 6201 + }, + { + "epoch": 1.1517177344475396, + "grad_norm": 1.57802414894104, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8729686737060547, + "num_tokens": 226097226.0, + "step": 6202 + }, + { + "epoch": 1.151903435468895, + "grad_norm": 1.5312994718551636, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8872665166854858, + "num_tokens": 226130747.0, + "step": 6203 + }, + { + "epoch": 1.1520891364902508, + "grad_norm": 1.567350149154663, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8653185367584229, + "num_tokens": 226169575.0, + "step": 6204 + }, + { + "epoch": 1.1522748375116063, + "grad_norm": 1.5067839622497559, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8805213570594788, + "num_tokens": 226207507.0, + "step": 6205 + }, + { + "epoch": 1.1524605385329618, + "grad_norm": 1.824938178062439, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8730747103691101, + "num_tokens": 226236711.0, + "step": 6206 + }, + { + "epoch": 1.1526462395543176, + "grad_norm": 1.5966829061508179, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8890707492828369, + "num_tokens": 226272557.0, + "step": 6207 + }, + { + "epoch": 1.1528319405756733, + "grad_norm": 1.5454624891281128, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8860223293304443, + "num_tokens": 226305698.0, + "step": 6208 + }, + { + "epoch": 1.1530176415970288, + "grad_norm": 1.6328741312026978, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8773250579833984, + "num_tokens": 226338527.0, + "step": 6209 + }, + { + "epoch": 1.1532033426183843, + "grad_norm": 1.7231206893920898, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8817477822303772, + "num_tokens": 226367714.0, + "step": 6210 + }, + { + "epoch": 1.15338904363974, + "grad_norm": 1.611565351486206, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8855246901512146, + "num_tokens": 226398589.0, + "step": 6211 + }, + { + "epoch": 1.1535747446610956, + "grad_norm": 1.5084282159805298, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.875540554523468, + "num_tokens": 226439323.0, + "step": 6212 + }, + { + "epoch": 1.1537604456824513, + "grad_norm": 1.4272406101226807, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8935357928276062, + "num_tokens": 226476410.0, + "step": 6213 + }, + { + "epoch": 1.1539461467038068, + "grad_norm": 1.6526403427124023, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8695123195648193, + "num_tokens": 226511800.0, + "step": 6214 + }, + { + "epoch": 1.1541318477251625, + "grad_norm": 1.4649279117584229, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8881192207336426, + "num_tokens": 226551081.0, + "step": 6215 + }, + { + "epoch": 1.154317548746518, + "grad_norm": 1.5229958295822144, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.866095781326294, + "num_tokens": 226588431.0, + "step": 6216 + }, + { + "epoch": 1.1545032497678738, + "grad_norm": 1.5471417903900146, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8843035101890564, + "num_tokens": 226624402.0, + "step": 6217 + }, + { + "epoch": 1.1546889507892293, + "grad_norm": 1.4686036109924316, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8782135248184204, + "num_tokens": 226662318.0, + "step": 6218 + }, + { + "epoch": 1.154874651810585, + "grad_norm": 1.5266671180725098, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8721833825111389, + "num_tokens": 226700796.0, + "step": 6219 + }, + { + "epoch": 1.1550603528319405, + "grad_norm": 1.3983956575393677, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8919447064399719, + "num_tokens": 226740401.0, + "step": 6220 + }, + { + "epoch": 1.1552460538532963, + "grad_norm": 1.7116904258728027, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8744626045227051, + "num_tokens": 226774400.0, + "step": 6221 + }, + { + "epoch": 1.1554317548746518, + "grad_norm": 1.5148121118545532, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8727839589118958, + "num_tokens": 226817331.0, + "step": 6222 + }, + { + "epoch": 1.1556174558960075, + "grad_norm": 1.4256259202957153, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8715322017669678, + "num_tokens": 226859475.0, + "step": 6223 + }, + { + "epoch": 1.155803156917363, + "grad_norm": 1.6121854782104492, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8760297298431396, + "num_tokens": 226895228.0, + "step": 6224 + }, + { + "epoch": 1.1559888579387188, + "grad_norm": 1.4226598739624023, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.893122673034668, + "num_tokens": 226932461.0, + "step": 6225 + }, + { + "epoch": 1.1561745589600743, + "grad_norm": 1.5659797191619873, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.885694146156311, + "num_tokens": 226964383.0, + "step": 6226 + }, + { + "epoch": 1.15636025998143, + "grad_norm": 1.677114725112915, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8731867671012878, + "num_tokens": 227001061.0, + "step": 6227 + }, + { + "epoch": 1.1565459610027855, + "grad_norm": 1.6159111261367798, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8723345994949341, + "num_tokens": 227034962.0, + "step": 6228 + }, + { + "epoch": 1.156731662024141, + "grad_norm": 1.512997031211853, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8883336186408997, + "num_tokens": 227068702.0, + "step": 6229 + }, + { + "epoch": 1.1569173630454967, + "grad_norm": 1.493622064590454, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8806210160255432, + "num_tokens": 227107341.0, + "step": 6230 + }, + { + "epoch": 1.1571030640668525, + "grad_norm": 1.530143141746521, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8697177171707153, + "num_tokens": 227145956.0, + "step": 6231 + }, + { + "epoch": 1.157288765088208, + "grad_norm": 1.5458303689956665, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8857494592666626, + "num_tokens": 227184988.0, + "step": 6232 + }, + { + "epoch": 1.1574744661095635, + "grad_norm": 1.534340500831604, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8701733946800232, + "num_tokens": 227224956.0, + "step": 6233 + }, + { + "epoch": 1.1576601671309192, + "grad_norm": 1.6501030921936035, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.876162052154541, + "num_tokens": 227260536.0, + "step": 6234 + }, + { + "epoch": 1.1578458681522747, + "grad_norm": 1.6098512411117554, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8799007534980774, + "num_tokens": 227294428.0, + "step": 6235 + }, + { + "epoch": 1.1580315691736305, + "grad_norm": 1.5045791864395142, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8799915313720703, + "num_tokens": 227329842.0, + "step": 6236 + }, + { + "epoch": 1.158217270194986, + "grad_norm": 1.640760898590088, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.878896951675415, + "num_tokens": 227362317.0, + "step": 6237 + }, + { + "epoch": 1.1584029712163417, + "grad_norm": 1.5534590482711792, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8772134780883789, + "num_tokens": 227399738.0, + "step": 6238 + }, + { + "epoch": 1.1585886722376972, + "grad_norm": 1.538847804069519, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8729053735733032, + "num_tokens": 227436473.0, + "step": 6239 + }, + { + "epoch": 1.158774373259053, + "grad_norm": 1.614912986755371, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8761135339736938, + "num_tokens": 227471457.0, + "step": 6240 + }, + { + "epoch": 1.1589600742804085, + "grad_norm": 1.5495070219039917, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.864626407623291, + "num_tokens": 227508405.0, + "step": 6241 + }, + { + "epoch": 1.1591457753017642, + "grad_norm": 1.5188639163970947, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8706114292144775, + "num_tokens": 227547504.0, + "step": 6242 + }, + { + "epoch": 1.1593314763231197, + "grad_norm": 1.5963916778564453, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8798931837081909, + "num_tokens": 227580435.0, + "step": 6243 + }, + { + "epoch": 1.1595171773444755, + "grad_norm": 1.4868226051330566, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8909506797790527, + "num_tokens": 227617085.0, + "step": 6244 + }, + { + "epoch": 1.159702878365831, + "grad_norm": 1.458254337310791, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8811765909194946, + "num_tokens": 227658325.0, + "step": 6245 + }, + { + "epoch": 1.1598885793871867, + "grad_norm": 1.5694016218185425, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8673754930496216, + "num_tokens": 227695324.0, + "step": 6246 + }, + { + "epoch": 1.1600742804085422, + "grad_norm": 1.556591272354126, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8658899068832397, + "num_tokens": 227731574.0, + "step": 6247 + }, + { + "epoch": 1.160259981429898, + "grad_norm": 1.483109712600708, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8818813562393188, + "num_tokens": 227767886.0, + "step": 6248 + }, + { + "epoch": 1.1604456824512535, + "grad_norm": 1.4992886781692505, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8577576279640198, + "num_tokens": 227809817.0, + "step": 6249 + }, + { + "epoch": 1.1606313834726092, + "grad_norm": 1.4557428359985352, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8741768598556519, + "num_tokens": 227854049.0, + "step": 6250 + }, + { + "epoch": 1.1608170844939647, + "grad_norm": 1.5436811447143555, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8824856281280518, + "num_tokens": 227888176.0, + "step": 6251 + }, + { + "epoch": 1.1610027855153202, + "grad_norm": 1.526576280593872, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8685635328292847, + "num_tokens": 227929269.0, + "step": 6252 + }, + { + "epoch": 1.161188486536676, + "grad_norm": 1.5779597759246826, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8849582076072693, + "num_tokens": 227963118.0, + "step": 6253 + }, + { + "epoch": 1.1613741875580317, + "grad_norm": 1.6423574686050415, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8642059564590454, + "num_tokens": 228000732.0, + "step": 6254 + }, + { + "epoch": 1.1615598885793872, + "grad_norm": 1.6361013650894165, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8870806694030762, + "num_tokens": 228031288.0, + "step": 6255 + }, + { + "epoch": 1.1617455896007427, + "grad_norm": 1.576005458831787, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8926123380661011, + "num_tokens": 228063629.0, + "step": 6256 + }, + { + "epoch": 1.1619312906220984, + "grad_norm": 1.596994161605835, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8522068858146667, + "num_tokens": 228102038.0, + "step": 6257 + }, + { + "epoch": 1.162116991643454, + "grad_norm": 1.5892653465270996, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8690069913864136, + "num_tokens": 228138290.0, + "step": 6258 + }, + { + "epoch": 1.1623026926648097, + "grad_norm": 1.5212867259979248, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8653362989425659, + "num_tokens": 228179559.0, + "step": 6259 + }, + { + "epoch": 1.1624883936861652, + "grad_norm": 1.4727544784545898, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8823210000991821, + "num_tokens": 228215666.0, + "step": 6260 + }, + { + "epoch": 1.162674094707521, + "grad_norm": 1.557145357131958, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8746626377105713, + "num_tokens": 228253739.0, + "step": 6261 + }, + { + "epoch": 1.1628597957288764, + "grad_norm": 1.4272077083587646, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8833506107330322, + "num_tokens": 228295629.0, + "step": 6262 + }, + { + "epoch": 1.1630454967502322, + "grad_norm": 1.6080378293991089, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8765056729316711, + "num_tokens": 228330536.0, + "step": 6263 + }, + { + "epoch": 1.1632311977715877, + "grad_norm": 1.6600167751312256, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8749814033508301, + "num_tokens": 228360811.0, + "step": 6264 + }, + { + "epoch": 1.1634168987929434, + "grad_norm": 1.5523415803909302, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8659513592720032, + "num_tokens": 228399300.0, + "step": 6265 + }, + { + "epoch": 1.163602599814299, + "grad_norm": 1.600921869277954, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8853617906570435, + "num_tokens": 228433899.0, + "step": 6266 + }, + { + "epoch": 1.1637883008356547, + "grad_norm": 1.4883394241333008, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8780845999717712, + "num_tokens": 228470629.0, + "step": 6267 + }, + { + "epoch": 1.1639740018570102, + "grad_norm": 1.5683338642120361, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8733999133110046, + "num_tokens": 228508600.0, + "step": 6268 + }, + { + "epoch": 1.164159702878366, + "grad_norm": 1.4176445007324219, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8824533820152283, + "num_tokens": 228548685.0, + "step": 6269 + }, + { + "epoch": 1.1643454038997214, + "grad_norm": 1.4272150993347168, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8924679756164551, + "num_tokens": 228584763.0, + "step": 6270 + }, + { + "epoch": 1.1645311049210771, + "grad_norm": 1.474594235420227, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8736345767974854, + "num_tokens": 228623890.0, + "step": 6271 + }, + { + "epoch": 1.1647168059424327, + "grad_norm": 1.6694906949996948, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8820611238479614, + "num_tokens": 228656227.0, + "step": 6272 + }, + { + "epoch": 1.1649025069637884, + "grad_norm": 1.4313901662826538, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8874078989028931, + "num_tokens": 228694623.0, + "step": 6273 + }, + { + "epoch": 1.165088207985144, + "grad_norm": 1.4789658784866333, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8852862119674683, + "num_tokens": 228730963.0, + "step": 6274 + }, + { + "epoch": 1.1652739090064996, + "grad_norm": 1.5819213390350342, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8729984164237976, + "num_tokens": 228767097.0, + "step": 6275 + }, + { + "epoch": 1.1654596100278551, + "grad_norm": 1.6406490802764893, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8786534667015076, + "num_tokens": 228800864.0, + "step": 6276 + }, + { + "epoch": 1.1656453110492109, + "grad_norm": 1.4148606061935425, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8842899799346924, + "num_tokens": 228841852.0, + "step": 6277 + }, + { + "epoch": 1.1658310120705664, + "grad_norm": 1.5348924398422241, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8786395788192749, + "num_tokens": 228879630.0, + "step": 6278 + }, + { + "epoch": 1.166016713091922, + "grad_norm": 1.5318201780319214, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.878502607345581, + "num_tokens": 228915365.0, + "step": 6279 + }, + { + "epoch": 1.1662024141132776, + "grad_norm": 1.3798267841339111, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8720880150794983, + "num_tokens": 228960698.0, + "step": 6280 + }, + { + "epoch": 1.1663881151346334, + "grad_norm": 1.568424105644226, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8776293992996216, + "num_tokens": 228994236.0, + "step": 6281 + }, + { + "epoch": 1.1665738161559889, + "grad_norm": 1.5724329948425293, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8802851438522339, + "num_tokens": 229029030.0, + "step": 6282 + }, + { + "epoch": 1.1667595171773444, + "grad_norm": 1.420170545578003, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8802204132080078, + "num_tokens": 229068164.0, + "step": 6283 + }, + { + "epoch": 1.1669452181987001, + "grad_norm": 1.6552700996398926, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8843056559562683, + "num_tokens": 229101126.0, + "step": 6284 + }, + { + "epoch": 1.1671309192200556, + "grad_norm": 1.5987831354141235, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8797329664230347, + "num_tokens": 229135371.0, + "step": 6285 + }, + { + "epoch": 1.1673166202414114, + "grad_norm": 1.5118238925933838, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8751413822174072, + "num_tokens": 229173792.0, + "step": 6286 + }, + { + "epoch": 1.1675023212627669, + "grad_norm": 1.5882455110549927, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8695767521858215, + "num_tokens": 229214366.0, + "step": 6287 + }, + { + "epoch": 1.1676880222841226, + "grad_norm": 1.4027271270751953, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8812962174415588, + "num_tokens": 229254414.0, + "step": 6288 + }, + { + "epoch": 1.1678737233054781, + "grad_norm": 1.657730221748352, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8786635398864746, + "num_tokens": 229289445.0, + "step": 6289 + }, + { + "epoch": 1.1680594243268339, + "grad_norm": 1.6022289991378784, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8680974245071411, + "num_tokens": 229325879.0, + "step": 6290 + }, + { + "epoch": 1.1682451253481894, + "grad_norm": 1.4382381439208984, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8849607110023499, + "num_tokens": 229364666.0, + "step": 6291 + }, + { + "epoch": 1.168430826369545, + "grad_norm": 1.5339142084121704, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.867273211479187, + "num_tokens": 229402589.0, + "step": 6292 + }, + { + "epoch": 1.1686165273909006, + "grad_norm": 1.542250633239746, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8802756071090698, + "num_tokens": 229444006.0, + "step": 6293 + }, + { + "epoch": 1.1688022284122563, + "grad_norm": 1.5861321687698364, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8642795085906982, + "num_tokens": 229479986.0, + "step": 6294 + }, + { + "epoch": 1.1689879294336118, + "grad_norm": 1.6316829919815063, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8673608899116516, + "num_tokens": 229515078.0, + "step": 6295 + }, + { + "epoch": 1.1691736304549676, + "grad_norm": 1.6280527114868164, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.879068911075592, + "num_tokens": 229554182.0, + "step": 6296 + }, + { + "epoch": 1.169359331476323, + "grad_norm": 1.5735032558441162, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8646866083145142, + "num_tokens": 229592830.0, + "step": 6297 + }, + { + "epoch": 1.1695450324976788, + "grad_norm": 1.6565227508544922, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8953837156295776, + "num_tokens": 229624898.0, + "step": 6298 + }, + { + "epoch": 1.1697307335190343, + "grad_norm": 1.4431830644607544, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8786460161209106, + "num_tokens": 229666254.0, + "step": 6299 + }, + { + "epoch": 1.16991643454039, + "grad_norm": 1.7002958059310913, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.873844563961029, + "num_tokens": 229701096.0, + "step": 6300 + }, + { + "epoch": 1.1701021355617456, + "grad_norm": 1.5861616134643555, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8740042448043823, + "num_tokens": 229734825.0, + "step": 6301 + }, + { + "epoch": 1.170287836583101, + "grad_norm": 1.5685266256332397, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8763518333435059, + "num_tokens": 229769735.0, + "step": 6302 + }, + { + "epoch": 1.1704735376044568, + "grad_norm": 1.4723386764526367, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8875089883804321, + "num_tokens": 229808599.0, + "step": 6303 + }, + { + "epoch": 1.1706592386258126, + "grad_norm": 1.5337731838226318, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8893210887908936, + "num_tokens": 229843690.0, + "step": 6304 + }, + { + "epoch": 1.170844939647168, + "grad_norm": 1.4874197244644165, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8714860677719116, + "num_tokens": 229882665.0, + "step": 6305 + }, + { + "epoch": 1.1710306406685236, + "grad_norm": 1.4317973852157593, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8809999227523804, + "num_tokens": 229925462.0, + "step": 6306 + }, + { + "epoch": 1.1712163416898793, + "grad_norm": 1.6545144319534302, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8628822565078735, + "num_tokens": 229961647.0, + "step": 6307 + }, + { + "epoch": 1.1714020427112348, + "grad_norm": 1.5507737398147583, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8638744354248047, + "num_tokens": 229999329.0, + "step": 6308 + }, + { + "epoch": 1.1715877437325906, + "grad_norm": 1.4911630153656006, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8782922029495239, + "num_tokens": 230035642.0, + "step": 6309 + }, + { + "epoch": 1.171773444753946, + "grad_norm": 1.5609157085418701, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8608927130699158, + "num_tokens": 230073934.0, + "step": 6310 + }, + { + "epoch": 1.1719591457753018, + "grad_norm": 1.9022698402404785, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8810758590698242, + "num_tokens": 230110733.0, + "step": 6311 + }, + { + "epoch": 1.1721448467966573, + "grad_norm": 1.5726064443588257, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8689378499984741, + "num_tokens": 230150403.0, + "step": 6312 + }, + { + "epoch": 1.172330547818013, + "grad_norm": 1.5868301391601562, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8617661595344543, + "num_tokens": 230192551.0, + "step": 6313 + }, + { + "epoch": 1.1725162488393686, + "grad_norm": 1.5867820978164673, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.864590048789978, + "num_tokens": 230230778.0, + "step": 6314 + }, + { + "epoch": 1.1727019498607243, + "grad_norm": 1.534926414489746, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8708250522613525, + "num_tokens": 230268668.0, + "step": 6315 + }, + { + "epoch": 1.1728876508820798, + "grad_norm": 1.4147908687591553, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.892041802406311, + "num_tokens": 230308115.0, + "step": 6316 + }, + { + "epoch": 1.1730733519034355, + "grad_norm": 1.7505125999450684, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8576858043670654, + "num_tokens": 230342231.0, + "step": 6317 + }, + { + "epoch": 1.173259052924791, + "grad_norm": 1.518174409866333, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8759946823120117, + "num_tokens": 230380263.0, + "step": 6318 + }, + { + "epoch": 1.1734447539461468, + "grad_norm": 1.4746880531311035, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8739904165267944, + "num_tokens": 230420878.0, + "step": 6319 + }, + { + "epoch": 1.1736304549675023, + "grad_norm": 1.4947878122329712, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8872033357620239, + "num_tokens": 230457197.0, + "step": 6320 + }, + { + "epoch": 1.173816155988858, + "grad_norm": 1.4356461763381958, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8828655481338501, + "num_tokens": 230495663.0, + "step": 6321 + }, + { + "epoch": 1.1740018570102135, + "grad_norm": 1.6226725578308105, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8799739480018616, + "num_tokens": 230528054.0, + "step": 6322 + }, + { + "epoch": 1.1741875580315693, + "grad_norm": 1.5197962522506714, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8906489014625549, + "num_tokens": 230560350.0, + "step": 6323 + }, + { + "epoch": 1.1743732590529248, + "grad_norm": 1.5693590641021729, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.878089189529419, + "num_tokens": 230594995.0, + "step": 6324 + }, + { + "epoch": 1.1745589600742803, + "grad_norm": 1.501908540725708, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8624465465545654, + "num_tokens": 230634809.0, + "step": 6325 + }, + { + "epoch": 1.174744661095636, + "grad_norm": 1.4791449308395386, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8756632804870605, + "num_tokens": 230673365.0, + "step": 6326 + }, + { + "epoch": 1.1749303621169918, + "grad_norm": 1.5821787118911743, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8892534375190735, + "num_tokens": 230706176.0, + "step": 6327 + }, + { + "epoch": 1.1751160631383473, + "grad_norm": 1.6531156301498413, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8668429255485535, + "num_tokens": 230740030.0, + "step": 6328 + }, + { + "epoch": 1.1753017641597028, + "grad_norm": 1.5498095750808716, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8868516683578491, + "num_tokens": 230775293.0, + "step": 6329 + }, + { + "epoch": 1.1754874651810585, + "grad_norm": 1.4523698091506958, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8734880089759827, + "num_tokens": 230818329.0, + "step": 6330 + }, + { + "epoch": 1.175673166202414, + "grad_norm": 1.4487323760986328, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.878339409828186, + "num_tokens": 230858740.0, + "step": 6331 + }, + { + "epoch": 1.1758588672237698, + "grad_norm": 1.578008770942688, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8695443868637085, + "num_tokens": 230894756.0, + "step": 6332 + }, + { + "epoch": 1.1760445682451253, + "grad_norm": 1.479914903640747, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8795835971832275, + "num_tokens": 230931206.0, + "step": 6333 + }, + { + "epoch": 1.176230269266481, + "grad_norm": 1.5700159072875977, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8651244044303894, + "num_tokens": 230968531.0, + "step": 6334 + }, + { + "epoch": 1.1764159702878365, + "grad_norm": 1.5255945920944214, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8832067847251892, + "num_tokens": 231006504.0, + "step": 6335 + }, + { + "epoch": 1.1766016713091922, + "grad_norm": 1.4907479286193848, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8790959119796753, + "num_tokens": 231045981.0, + "step": 6336 + }, + { + "epoch": 1.1767873723305478, + "grad_norm": 1.7235372066497803, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8789604902267456, + "num_tokens": 231074826.0, + "step": 6337 + }, + { + "epoch": 1.1769730733519035, + "grad_norm": 1.4409054517745972, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8596300482749939, + "num_tokens": 231121363.0, + "step": 6338 + }, + { + "epoch": 1.177158774373259, + "grad_norm": 1.6035960912704468, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8754163384437561, + "num_tokens": 231154494.0, + "step": 6339 + }, + { + "epoch": 1.1773444753946147, + "grad_norm": 1.6090753078460693, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8821278214454651, + "num_tokens": 231189736.0, + "step": 6340 + }, + { + "epoch": 1.1775301764159702, + "grad_norm": 1.4985718727111816, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8642359972000122, + "num_tokens": 231231410.0, + "step": 6341 + }, + { + "epoch": 1.177715877437326, + "grad_norm": 1.7483378648757935, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8782464265823364, + "num_tokens": 231264248.0, + "step": 6342 + }, + { + "epoch": 1.1779015784586815, + "grad_norm": 1.490010380744934, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8737212419509888, + "num_tokens": 231304621.0, + "step": 6343 + }, + { + "epoch": 1.1780872794800372, + "grad_norm": 1.5035679340362549, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8913125991821289, + "num_tokens": 231337201.0, + "step": 6344 + }, + { + "epoch": 1.1782729805013927, + "grad_norm": 1.614332675933838, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8719556331634521, + "num_tokens": 231374532.0, + "step": 6345 + }, + { + "epoch": 1.1784586815227485, + "grad_norm": 1.5677779912948608, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8658275008201599, + "num_tokens": 231413891.0, + "step": 6346 + }, + { + "epoch": 1.178644382544104, + "grad_norm": 1.6038703918457031, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8765150904655457, + "num_tokens": 231448059.0, + "step": 6347 + }, + { + "epoch": 1.1788300835654595, + "grad_norm": 1.6043697595596313, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8775075674057007, + "num_tokens": 231483556.0, + "step": 6348 + }, + { + "epoch": 1.1790157845868152, + "grad_norm": 1.5405017137527466, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8785814046859741, + "num_tokens": 231517916.0, + "step": 6349 + }, + { + "epoch": 1.179201485608171, + "grad_norm": 1.6256295442581177, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8541206121444702, + "num_tokens": 231552237.0, + "step": 6350 + }, + { + "epoch": 1.1793871866295265, + "grad_norm": 1.4312174320220947, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8899251818656921, + "num_tokens": 231589426.0, + "step": 6351 + }, + { + "epoch": 1.179572887650882, + "grad_norm": 1.4156465530395508, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8815187215805054, + "num_tokens": 231630153.0, + "step": 6352 + }, + { + "epoch": 1.1797585886722377, + "grad_norm": 1.5454682111740112, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8763701915740967, + "num_tokens": 231663674.0, + "step": 6353 + }, + { + "epoch": 1.1799442896935932, + "grad_norm": 1.5298559665679932, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8782694935798645, + "num_tokens": 231701640.0, + "step": 6354 + }, + { + "epoch": 1.180129990714949, + "grad_norm": 1.4425545930862427, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8857828974723816, + "num_tokens": 231746630.0, + "step": 6355 + }, + { + "epoch": 1.1803156917363045, + "grad_norm": 1.6053346395492554, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8699211478233337, + "num_tokens": 231784175.0, + "step": 6356 + }, + { + "epoch": 1.1805013927576602, + "grad_norm": 1.6192858219146729, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8712498545646667, + "num_tokens": 231821581.0, + "step": 6357 + }, + { + "epoch": 1.1806870937790157, + "grad_norm": 1.5509511232376099, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.874969482421875, + "num_tokens": 231859700.0, + "step": 6358 + }, + { + "epoch": 1.1808727948003714, + "grad_norm": 1.5599651336669922, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8746262788772583, + "num_tokens": 231896159.0, + "step": 6359 + }, + { + "epoch": 1.181058495821727, + "grad_norm": 1.659257173538208, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8717622756958008, + "num_tokens": 231933982.0, + "step": 6360 + }, + { + "epoch": 1.1812441968430827, + "grad_norm": 1.768896460533142, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8560844659805298, + "num_tokens": 231972086.0, + "step": 6361 + }, + { + "epoch": 1.1814298978644382, + "grad_norm": 1.5443960428237915, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8889095783233643, + "num_tokens": 232004893.0, + "step": 6362 + }, + { + "epoch": 1.181615598885794, + "grad_norm": 1.5398707389831543, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8801367282867432, + "num_tokens": 232042842.0, + "step": 6363 + }, + { + "epoch": 1.1818012999071494, + "grad_norm": 1.5110381841659546, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8728204369544983, + "num_tokens": 232081335.0, + "step": 6364 + }, + { + "epoch": 1.1819870009285052, + "grad_norm": 1.5424034595489502, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8908556699752808, + "num_tokens": 232115644.0, + "step": 6365 + }, + { + "epoch": 1.1821727019498607, + "grad_norm": 1.5205652713775635, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8834323287010193, + "num_tokens": 232150001.0, + "step": 6366 + }, + { + "epoch": 1.1823584029712164, + "grad_norm": 1.6173945665359497, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8798673748970032, + "num_tokens": 232184928.0, + "step": 6367 + }, + { + "epoch": 1.182544103992572, + "grad_norm": 1.7233829498291016, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8736971616744995, + "num_tokens": 232214443.0, + "step": 6368 + }, + { + "epoch": 1.1827298050139277, + "grad_norm": 1.6670646667480469, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8817059993743896, + "num_tokens": 232245616.0, + "step": 6369 + }, + { + "epoch": 1.1829155060352832, + "grad_norm": 1.61235773563385, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8833447694778442, + "num_tokens": 232280497.0, + "step": 6370 + }, + { + "epoch": 1.183101207056639, + "grad_norm": 1.5658236742019653, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8703570365905762, + "num_tokens": 232320316.0, + "step": 6371 + }, + { + "epoch": 1.1832869080779944, + "grad_norm": 1.4612494707107544, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8701989650726318, + "num_tokens": 232362199.0, + "step": 6372 + }, + { + "epoch": 1.1834726090993501, + "grad_norm": 1.646652102470398, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8546577095985413, + "num_tokens": 232399039.0, + "step": 6373 + }, + { + "epoch": 1.1836583101207057, + "grad_norm": 1.574245810508728, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8860041499137878, + "num_tokens": 232434040.0, + "step": 6374 + }, + { + "epoch": 1.1838440111420612, + "grad_norm": 1.4260212182998657, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.873593270778656, + "num_tokens": 232473114.0, + "step": 6375 + }, + { + "epoch": 1.184029712163417, + "grad_norm": 1.592555284500122, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8624649047851562, + "num_tokens": 232511966.0, + "step": 6376 + }, + { + "epoch": 1.1842154131847726, + "grad_norm": 1.646630048751831, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8799796104431152, + "num_tokens": 232547539.0, + "step": 6377 + }, + { + "epoch": 1.1844011142061281, + "grad_norm": 1.480220079421997, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8778603076934814, + "num_tokens": 232589785.0, + "step": 6378 + }, + { + "epoch": 1.1845868152274837, + "grad_norm": 1.5047160387039185, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8704615831375122, + "num_tokens": 232628914.0, + "step": 6379 + }, + { + "epoch": 1.1847725162488394, + "grad_norm": 1.60512113571167, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8714925050735474, + "num_tokens": 232663565.0, + "step": 6380 + }, + { + "epoch": 1.184958217270195, + "grad_norm": 1.4948769807815552, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8746424317359924, + "num_tokens": 232701994.0, + "step": 6381 + }, + { + "epoch": 1.1851439182915506, + "grad_norm": 1.5710625648498535, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8733357191085815, + "num_tokens": 232739952.0, + "step": 6382 + }, + { + "epoch": 1.1853296193129061, + "grad_norm": 1.4940462112426758, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8831583261489868, + "num_tokens": 232777620.0, + "step": 6383 + }, + { + "epoch": 1.1855153203342619, + "grad_norm": 1.7005627155303955, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8797731399536133, + "num_tokens": 232809235.0, + "step": 6384 + }, + { + "epoch": 1.1857010213556174, + "grad_norm": 1.5130834579467773, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8787314891815186, + "num_tokens": 232843697.0, + "step": 6385 + }, + { + "epoch": 1.1858867223769731, + "grad_norm": 1.6357430219650269, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8762673735618591, + "num_tokens": 232878650.0, + "step": 6386 + }, + { + "epoch": 1.1860724233983286, + "grad_norm": 1.7209335565567017, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8661744594573975, + "num_tokens": 232910675.0, + "step": 6387 + }, + { + "epoch": 1.1862581244196844, + "grad_norm": 1.5478997230529785, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8802213668823242, + "num_tokens": 232946008.0, + "step": 6388 + }, + { + "epoch": 1.1864438254410399, + "grad_norm": 1.621446132659912, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8577732443809509, + "num_tokens": 232983480.0, + "step": 6389 + }, + { + "epoch": 1.1866295264623956, + "grad_norm": 1.4655287265777588, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8749579787254333, + "num_tokens": 233024996.0, + "step": 6390 + }, + { + "epoch": 1.1868152274837511, + "grad_norm": 1.5262916088104248, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8892422318458557, + "num_tokens": 233056739.0, + "step": 6391 + }, + { + "epoch": 1.1870009285051069, + "grad_norm": 1.5653352737426758, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8824578523635864, + "num_tokens": 233089524.0, + "step": 6392 + }, + { + "epoch": 1.1871866295264624, + "grad_norm": 1.4785497188568115, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.884146511554718, + "num_tokens": 233129410.0, + "step": 6393 + }, + { + "epoch": 1.187372330547818, + "grad_norm": 1.5599266290664673, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8732007741928101, + "num_tokens": 233165728.0, + "step": 6394 + }, + { + "epoch": 1.1875580315691736, + "grad_norm": 1.4391142129898071, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8852350115776062, + "num_tokens": 233205746.0, + "step": 6395 + }, + { + "epoch": 1.1877437325905293, + "grad_norm": 1.5813535451889038, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8710113763809204, + "num_tokens": 233241400.0, + "step": 6396 + }, + { + "epoch": 1.1879294336118849, + "grad_norm": 1.59273099899292, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8677564263343811, + "num_tokens": 233277691.0, + "step": 6397 + }, + { + "epoch": 1.1881151346332404, + "grad_norm": 1.6001865863800049, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8650898933410645, + "num_tokens": 233313774.0, + "step": 6398 + }, + { + "epoch": 1.188300835654596, + "grad_norm": 1.4552969932556152, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8873542547225952, + "num_tokens": 233353214.0, + "step": 6399 + }, + { + "epoch": 1.1884865366759518, + "grad_norm": 1.5401690006256104, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8706513047218323, + "num_tokens": 233392263.0, + "step": 6400 + }, + { + "epoch": 1.1886722376973073, + "grad_norm": 1.6106367111206055, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8700354099273682, + "num_tokens": 233427453.0, + "step": 6401 + }, + { + "epoch": 1.1888579387186629, + "grad_norm": 1.5678045749664307, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8623251914978027, + "num_tokens": 233465674.0, + "step": 6402 + }, + { + "epoch": 1.1890436397400186, + "grad_norm": 1.5986567735671997, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8644239902496338, + "num_tokens": 233501729.0, + "step": 6403 + }, + { + "epoch": 1.189229340761374, + "grad_norm": 1.6600525379180908, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8856711387634277, + "num_tokens": 233533095.0, + "step": 6404 + }, + { + "epoch": 1.1894150417827298, + "grad_norm": 1.5147050619125366, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8760907649993896, + "num_tokens": 233571404.0, + "step": 6405 + }, + { + "epoch": 1.1896007428040853, + "grad_norm": 1.49744713306427, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8741652965545654, + "num_tokens": 233608865.0, + "step": 6406 + }, + { + "epoch": 1.189786443825441, + "grad_norm": 1.558693766593933, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.869250476360321, + "num_tokens": 233647143.0, + "step": 6407 + }, + { + "epoch": 1.1899721448467966, + "grad_norm": 1.6095304489135742, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8604724407196045, + "num_tokens": 233683592.0, + "step": 6408 + }, + { + "epoch": 1.1901578458681523, + "grad_norm": 1.5578594207763672, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8895360827445984, + "num_tokens": 233719134.0, + "step": 6409 + }, + { + "epoch": 1.1903435468895078, + "grad_norm": 1.768916130065918, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8570300340652466, + "num_tokens": 233750157.0, + "step": 6410 + }, + { + "epoch": 1.1905292479108636, + "grad_norm": 1.469425082206726, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.878015398979187, + "num_tokens": 233792846.0, + "step": 6411 + }, + { + "epoch": 1.190714948932219, + "grad_norm": 1.6250996589660645, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8751863837242126, + "num_tokens": 233825502.0, + "step": 6412 + }, + { + "epoch": 1.1909006499535748, + "grad_norm": 1.4234845638275146, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8917018175125122, + "num_tokens": 233864319.0, + "step": 6413 + }, + { + "epoch": 1.1910863509749303, + "grad_norm": 1.6023808717727661, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8815513849258423, + "num_tokens": 233900094.0, + "step": 6414 + }, + { + "epoch": 1.191272051996286, + "grad_norm": 1.424118995666504, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8821873664855957, + "num_tokens": 233938733.0, + "step": 6415 + }, + { + "epoch": 1.1914577530176416, + "grad_norm": 1.5840708017349243, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8939251899719238, + "num_tokens": 233969787.0, + "step": 6416 + }, + { + "epoch": 1.1916434540389973, + "grad_norm": 1.691582202911377, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8858342170715332, + "num_tokens": 233999074.0, + "step": 6417 + }, + { + "epoch": 1.1918291550603528, + "grad_norm": 1.5469990968704224, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8694339394569397, + "num_tokens": 234034440.0, + "step": 6418 + }, + { + "epoch": 1.1920148560817085, + "grad_norm": 1.5902290344238281, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8764233589172363, + "num_tokens": 234070279.0, + "step": 6419 + }, + { + "epoch": 1.192200557103064, + "grad_norm": 1.5292556285858154, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8541300296783447, + "num_tokens": 234113067.0, + "step": 6420 + }, + { + "epoch": 1.1923862581244196, + "grad_norm": 1.4867768287658691, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8854377269744873, + "num_tokens": 234152032.0, + "step": 6421 + }, + { + "epoch": 1.1925719591457753, + "grad_norm": 1.5726670026779175, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8909989595413208, + "num_tokens": 234189955.0, + "step": 6422 + }, + { + "epoch": 1.192757660167131, + "grad_norm": 1.6982266902923584, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8770809173583984, + "num_tokens": 234221564.0, + "step": 6423 + }, + { + "epoch": 1.1929433611884865, + "grad_norm": 1.605156660079956, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8710891604423523, + "num_tokens": 234256954.0, + "step": 6424 + }, + { + "epoch": 1.193129062209842, + "grad_norm": 1.5359644889831543, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8914729356765747, + "num_tokens": 234292664.0, + "step": 6425 + }, + { + "epoch": 1.1933147632311978, + "grad_norm": 1.4725338220596313, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8854026794433594, + "num_tokens": 234330059.0, + "step": 6426 + }, + { + "epoch": 1.1935004642525533, + "grad_norm": 1.5627334117889404, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.880902886390686, + "num_tokens": 234363170.0, + "step": 6427 + }, + { + "epoch": 1.193686165273909, + "grad_norm": 1.4755134582519531, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8768899440765381, + "num_tokens": 234404107.0, + "step": 6428 + }, + { + "epoch": 1.1938718662952645, + "grad_norm": 1.5958149433135986, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.878216564655304, + "num_tokens": 234437744.0, + "step": 6429 + }, + { + "epoch": 1.1940575673166203, + "grad_norm": 1.9398554563522339, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8927426934242249, + "num_tokens": 234468179.0, + "step": 6430 + }, + { + "epoch": 1.1942432683379758, + "grad_norm": 1.7638628482818604, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8857728242874146, + "num_tokens": 234493203.0, + "step": 6431 + }, + { + "epoch": 1.1944289693593315, + "grad_norm": 1.7103123664855957, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8686607480049133, + "num_tokens": 234524163.0, + "step": 6432 + }, + { + "epoch": 1.194614670380687, + "grad_norm": 1.5719140768051147, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8679112195968628, + "num_tokens": 234564763.0, + "step": 6433 + }, + { + "epoch": 1.1948003714020428, + "grad_norm": 1.396957278251648, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8835238814353943, + "num_tokens": 234609304.0, + "step": 6434 + }, + { + "epoch": 1.1949860724233983, + "grad_norm": 1.6437569856643677, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.877498209476471, + "num_tokens": 234644579.0, + "step": 6435 + }, + { + "epoch": 1.195171773444754, + "grad_norm": 1.5175307989120483, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8896185159683228, + "num_tokens": 234679277.0, + "step": 6436 + }, + { + "epoch": 1.1953574744661095, + "grad_norm": 1.4565060138702393, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8803858757019043, + "num_tokens": 234717561.0, + "step": 6437 + }, + { + "epoch": 1.1955431754874652, + "grad_norm": 1.5324736833572388, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8780701160430908, + "num_tokens": 234755109.0, + "step": 6438 + }, + { + "epoch": 1.1957288765088208, + "grad_norm": 1.6106878519058228, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8656514883041382, + "num_tokens": 234793313.0, + "step": 6439 + }, + { + "epoch": 1.1959145775301765, + "grad_norm": 1.739555835723877, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8595527410507202, + "num_tokens": 234827373.0, + "step": 6440 + }, + { + "epoch": 1.196100278551532, + "grad_norm": 1.750123381614685, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8783725500106812, + "num_tokens": 234859309.0, + "step": 6441 + }, + { + "epoch": 1.1962859795728877, + "grad_norm": 1.6252371072769165, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8807849884033203, + "num_tokens": 234895830.0, + "step": 6442 + }, + { + "epoch": 1.1964716805942432, + "grad_norm": 1.6743139028549194, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8683565258979797, + "num_tokens": 234929241.0, + "step": 6443 + }, + { + "epoch": 1.196657381615599, + "grad_norm": 1.5604102611541748, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8758056163787842, + "num_tokens": 234963788.0, + "step": 6444 + }, + { + "epoch": 1.1968430826369545, + "grad_norm": 1.6005162000656128, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8586680889129639, + "num_tokens": 235000168.0, + "step": 6445 + }, + { + "epoch": 1.1970287836583102, + "grad_norm": 1.5905370712280273, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8693487048149109, + "num_tokens": 235037025.0, + "step": 6446 + }, + { + "epoch": 1.1972144846796657, + "grad_norm": 1.5663349628448486, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8603055477142334, + "num_tokens": 235076984.0, + "step": 6447 + }, + { + "epoch": 1.1974001857010212, + "grad_norm": 1.5284236669540405, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8776883482933044, + "num_tokens": 235117787.0, + "step": 6448 + }, + { + "epoch": 1.197585886722377, + "grad_norm": 1.5451672077178955, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8872270584106445, + "num_tokens": 235151542.0, + "step": 6449 + }, + { + "epoch": 1.1977715877437327, + "grad_norm": 1.4905792474746704, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8789526224136353, + "num_tokens": 235188635.0, + "step": 6450 + }, + { + "epoch": 1.1979572887650882, + "grad_norm": 1.5296380519866943, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8797153234481812, + "num_tokens": 235226253.0, + "step": 6451 + }, + { + "epoch": 1.1981429897864437, + "grad_norm": 1.5511202812194824, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8596867918968201, + "num_tokens": 235263892.0, + "step": 6452 + }, + { + "epoch": 1.1983286908077995, + "grad_norm": 1.6168521642684937, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8639836311340332, + "num_tokens": 235302618.0, + "step": 6453 + }, + { + "epoch": 1.198514391829155, + "grad_norm": 1.6207436323165894, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.870128870010376, + "num_tokens": 235339540.0, + "step": 6454 + }, + { + "epoch": 1.1987000928505107, + "grad_norm": 1.5971037149429321, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8823013305664062, + "num_tokens": 235370978.0, + "step": 6455 + }, + { + "epoch": 1.1988857938718662, + "grad_norm": 1.5746008157730103, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8664842247962952, + "num_tokens": 235408599.0, + "step": 6456 + }, + { + "epoch": 1.199071494893222, + "grad_norm": 1.4694300889968872, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8871059417724609, + "num_tokens": 235446320.0, + "step": 6457 + }, + { + "epoch": 1.1992571959145775, + "grad_norm": 1.4516167640686035, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8743374347686768, + "num_tokens": 235486641.0, + "step": 6458 + }, + { + "epoch": 1.1994428969359332, + "grad_norm": 1.5923124551773071, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8820961713790894, + "num_tokens": 235520783.0, + "step": 6459 + }, + { + "epoch": 1.1996285979572887, + "grad_norm": 1.608811616897583, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8942530751228333, + "num_tokens": 235551061.0, + "step": 6460 + }, + { + "epoch": 1.1998142989786444, + "grad_norm": 1.673217535018921, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8661103248596191, + "num_tokens": 235585226.0, + "step": 6461 + }, + { + "epoch": 1.2, + "grad_norm": 1.6016465425491333, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8656932711601257, + "num_tokens": 235623392.0, + "step": 6462 + }, + { + "epoch": 1.2001857010213557, + "grad_norm": 1.553742527961731, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8809527158737183, + "num_tokens": 235659124.0, + "step": 6463 + }, + { + "epoch": 1.2003714020427112, + "grad_norm": 1.5304834842681885, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8818112015724182, + "num_tokens": 235697135.0, + "step": 6464 + }, + { + "epoch": 1.200557103064067, + "grad_norm": 1.4790465831756592, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8837342262268066, + "num_tokens": 235735047.0, + "step": 6465 + }, + { + "epoch": 1.2007428040854224, + "grad_norm": 1.4704830646514893, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.861255407333374, + "num_tokens": 235778430.0, + "step": 6466 + }, + { + "epoch": 1.2009285051067782, + "grad_norm": 1.5033930540084839, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8732279539108276, + "num_tokens": 235820667.0, + "step": 6467 + }, + { + "epoch": 1.2011142061281337, + "grad_norm": 1.708269715309143, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8654016256332397, + "num_tokens": 235858851.0, + "step": 6468 + }, + { + "epoch": 1.2012999071494894, + "grad_norm": 1.6415987014770508, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8876002430915833, + "num_tokens": 235893574.0, + "step": 6469 + }, + { + "epoch": 1.201485608170845, + "grad_norm": 1.5036894083023071, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8799842596054077, + "num_tokens": 235930738.0, + "step": 6470 + }, + { + "epoch": 1.2016713091922004, + "grad_norm": 1.5131560564041138, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8823435306549072, + "num_tokens": 235969800.0, + "step": 6471 + }, + { + "epoch": 1.2018570102135562, + "grad_norm": 1.5033270120620728, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8685407638549805, + "num_tokens": 236010629.0, + "step": 6472 + }, + { + "epoch": 1.202042711234912, + "grad_norm": 1.5476540327072144, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8735370635986328, + "num_tokens": 236048836.0, + "step": 6473 + }, + { + "epoch": 1.2022284122562674, + "grad_norm": 1.483172059059143, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8776528835296631, + "num_tokens": 236089777.0, + "step": 6474 + }, + { + "epoch": 1.202414113277623, + "grad_norm": 1.7397361993789673, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8602496385574341, + "num_tokens": 236121719.0, + "step": 6475 + }, + { + "epoch": 1.2025998142989787, + "grad_norm": 1.6830120086669922, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8773003816604614, + "num_tokens": 236154045.0, + "step": 6476 + }, + { + "epoch": 1.2027855153203342, + "grad_norm": 1.5567505359649658, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.871143102645874, + "num_tokens": 236189031.0, + "step": 6477 + }, + { + "epoch": 1.20297121634169, + "grad_norm": 1.513530969619751, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8607600331306458, + "num_tokens": 236229443.0, + "step": 6478 + }, + { + "epoch": 1.2031569173630454, + "grad_norm": 1.661376714706421, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8775508999824524, + "num_tokens": 236259677.0, + "step": 6479 + }, + { + "epoch": 1.2033426183844012, + "grad_norm": 1.5617295503616333, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8725167512893677, + "num_tokens": 236293809.0, + "step": 6480 + }, + { + "epoch": 1.2035283194057567, + "grad_norm": 1.5220688581466675, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8913196325302124, + "num_tokens": 236329993.0, + "step": 6481 + }, + { + "epoch": 1.2037140204271124, + "grad_norm": 1.5279556512832642, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8834400177001953, + "num_tokens": 236366226.0, + "step": 6482 + }, + { + "epoch": 1.203899721448468, + "grad_norm": 1.5812491178512573, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8653386831283569, + "num_tokens": 236402924.0, + "step": 6483 + }, + { + "epoch": 1.2040854224698236, + "grad_norm": 1.539933681488037, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8832221031188965, + "num_tokens": 236438297.0, + "step": 6484 + }, + { + "epoch": 1.2042711234911792, + "grad_norm": 1.7864214181900024, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8668149709701538, + "num_tokens": 236468191.0, + "step": 6485 + }, + { + "epoch": 1.2044568245125349, + "grad_norm": 1.5575565099716187, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8883422613143921, + "num_tokens": 236501324.0, + "step": 6486 + }, + { + "epoch": 1.2046425255338904, + "grad_norm": 2.0207948684692383, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8640876412391663, + "num_tokens": 236527398.0, + "step": 6487 + }, + { + "epoch": 1.2048282265552461, + "grad_norm": 1.6004804372787476, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8842735290527344, + "num_tokens": 236562418.0, + "step": 6488 + }, + { + "epoch": 1.2050139275766016, + "grad_norm": 1.5968565940856934, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8735994100570679, + "num_tokens": 236601574.0, + "step": 6489 + }, + { + "epoch": 1.2051996285979574, + "grad_norm": 1.6053048372268677, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8671948909759521, + "num_tokens": 236642024.0, + "step": 6490 + }, + { + "epoch": 1.2053853296193129, + "grad_norm": 1.4487454891204834, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8793774247169495, + "num_tokens": 236682358.0, + "step": 6491 + }, + { + "epoch": 1.2055710306406686, + "grad_norm": 1.425601840019226, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8800392746925354, + "num_tokens": 236722159.0, + "step": 6492 + }, + { + "epoch": 1.2057567316620241, + "grad_norm": 1.7999151945114136, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8743289709091187, + "num_tokens": 236754040.0, + "step": 6493 + }, + { + "epoch": 1.2059424326833796, + "grad_norm": 1.7320142984390259, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8630185127258301, + "num_tokens": 236790578.0, + "step": 6494 + }, + { + "epoch": 1.2061281337047354, + "grad_norm": 1.6654620170593262, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8667151927947998, + "num_tokens": 236823928.0, + "step": 6495 + }, + { + "epoch": 1.206313834726091, + "grad_norm": 1.5063800811767578, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8745903968811035, + "num_tokens": 236863768.0, + "step": 6496 + }, + { + "epoch": 1.2064995357474466, + "grad_norm": 1.529821753501892, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8670167922973633, + "num_tokens": 236900600.0, + "step": 6497 + }, + { + "epoch": 1.2066852367688021, + "grad_norm": 1.4628641605377197, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8914011716842651, + "num_tokens": 236934912.0, + "step": 6498 + }, + { + "epoch": 1.2068709377901579, + "grad_norm": 1.4497946500778198, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8794975876808167, + "num_tokens": 236972921.0, + "step": 6499 + }, + { + "epoch": 1.2070566388115134, + "grad_norm": 1.5605086088180542, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8779628276824951, + "num_tokens": 237010291.0, + "step": 6500 + }, + { + "epoch": 1.207242339832869, + "grad_norm": 1.6216222047805786, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8830816745758057, + "num_tokens": 237043658.0, + "step": 6501 + }, + { + "epoch": 1.2074280408542246, + "grad_norm": 1.5041899681091309, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8855806589126587, + "num_tokens": 237081673.0, + "step": 6502 + }, + { + "epoch": 1.2076137418755803, + "grad_norm": 1.5029499530792236, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8764656186103821, + "num_tokens": 237121432.0, + "step": 6503 + }, + { + "epoch": 1.2077994428969359, + "grad_norm": 1.51998770236969, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8829305768013, + "num_tokens": 237159474.0, + "step": 6504 + }, + { + "epoch": 1.2079851439182916, + "grad_norm": 1.4999825954437256, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8810205459594727, + "num_tokens": 237195482.0, + "step": 6505 + }, + { + "epoch": 1.208170844939647, + "grad_norm": 1.5978870391845703, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8832502365112305, + "num_tokens": 237231318.0, + "step": 6506 + }, + { + "epoch": 1.2083565459610028, + "grad_norm": 1.5568506717681885, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8802167177200317, + "num_tokens": 237267891.0, + "step": 6507 + }, + { + "epoch": 1.2085422469823583, + "grad_norm": 1.5582510232925415, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8544836044311523, + "num_tokens": 237313407.0, + "step": 6508 + }, + { + "epoch": 1.208727948003714, + "grad_norm": 1.4895238876342773, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8738845586776733, + "num_tokens": 237351202.0, + "step": 6509 + }, + { + "epoch": 1.2089136490250696, + "grad_norm": 1.488199234008789, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8719915151596069, + "num_tokens": 237390709.0, + "step": 6510 + }, + { + "epoch": 1.2090993500464253, + "grad_norm": 1.48484468460083, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8857172727584839, + "num_tokens": 237425431.0, + "step": 6511 + }, + { + "epoch": 1.2092850510677808, + "grad_norm": 1.5469051599502563, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8736302852630615, + "num_tokens": 237461649.0, + "step": 6512 + }, + { + "epoch": 1.2094707520891366, + "grad_norm": 1.6392625570297241, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8765477538108826, + "num_tokens": 237495088.0, + "step": 6513 + }, + { + "epoch": 1.209656453110492, + "grad_norm": 1.6075365543365479, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8742064237594604, + "num_tokens": 237534062.0, + "step": 6514 + }, + { + "epoch": 1.2098421541318478, + "grad_norm": 1.4227694272994995, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8694812059402466, + "num_tokens": 237576699.0, + "step": 6515 + }, + { + "epoch": 1.2100278551532033, + "grad_norm": 1.5193973779678345, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8725777864456177, + "num_tokens": 237613767.0, + "step": 6516 + }, + { + "epoch": 1.2102135561745588, + "grad_norm": 1.6991641521453857, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8619182109832764, + "num_tokens": 237652927.0, + "step": 6517 + }, + { + "epoch": 1.2103992571959146, + "grad_norm": 1.5197209119796753, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8916084170341492, + "num_tokens": 237687087.0, + "step": 6518 + }, + { + "epoch": 1.2105849582172703, + "grad_norm": 2.0487139225006104, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8783930540084839, + "num_tokens": 237723205.0, + "step": 6519 + }, + { + "epoch": 1.2107706592386258, + "grad_norm": 1.5530595779418945, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8741238117218018, + "num_tokens": 237759218.0, + "step": 6520 + }, + { + "epoch": 1.2109563602599813, + "grad_norm": 1.509788990020752, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8814753293991089, + "num_tokens": 237795364.0, + "step": 6521 + }, + { + "epoch": 1.211142061281337, + "grad_norm": 1.7177891731262207, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8890328407287598, + "num_tokens": 237824629.0, + "step": 6522 + }, + { + "epoch": 1.2113277623026926, + "grad_norm": 1.5772382020950317, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8675408959388733, + "num_tokens": 237861741.0, + "step": 6523 + }, + { + "epoch": 1.2115134633240483, + "grad_norm": 1.611696720123291, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8946027755737305, + "num_tokens": 237890927.0, + "step": 6524 + }, + { + "epoch": 1.2116991643454038, + "grad_norm": 1.8047153949737549, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8706449866294861, + "num_tokens": 237918426.0, + "step": 6525 + }, + { + "epoch": 1.2118848653667595, + "grad_norm": 1.5963773727416992, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8804565668106079, + "num_tokens": 237950219.0, + "step": 6526 + }, + { + "epoch": 1.212070566388115, + "grad_norm": 1.5989726781845093, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.876795768737793, + "num_tokens": 237984386.0, + "step": 6527 + }, + { + "epoch": 1.2122562674094708, + "grad_norm": 1.5827025175094604, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8785253763198853, + "num_tokens": 238027221.0, + "step": 6528 + }, + { + "epoch": 1.2124419684308263, + "grad_norm": 1.7574604749679565, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8900951147079468, + "num_tokens": 238054586.0, + "step": 6529 + }, + { + "epoch": 1.212627669452182, + "grad_norm": 1.5042047500610352, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8670728206634521, + "num_tokens": 238093139.0, + "step": 6530 + }, + { + "epoch": 1.2128133704735375, + "grad_norm": 1.6365426778793335, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8737885355949402, + "num_tokens": 238128409.0, + "step": 6531 + }, + { + "epoch": 1.2129990714948933, + "grad_norm": 1.7507840394973755, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8711369037628174, + "num_tokens": 238164369.0, + "step": 6532 + }, + { + "epoch": 1.2131847725162488, + "grad_norm": 1.867477297782898, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8497370481491089, + "num_tokens": 238195712.0, + "step": 6533 + }, + { + "epoch": 1.2133704735376045, + "grad_norm": 1.6322320699691772, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8848508596420288, + "num_tokens": 238232386.0, + "step": 6534 + }, + { + "epoch": 1.21355617455896, + "grad_norm": 1.636883020401001, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8708181381225586, + "num_tokens": 238268691.0, + "step": 6535 + }, + { + "epoch": 1.2137418755803158, + "grad_norm": 1.5210858583450317, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8735669851303101, + "num_tokens": 238306967.0, + "step": 6536 + }, + { + "epoch": 1.2139275766016713, + "grad_norm": 1.5876070261001587, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8877888321876526, + "num_tokens": 238343921.0, + "step": 6537 + }, + { + "epoch": 1.214113277623027, + "grad_norm": 1.6053928136825562, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8820385932922363, + "num_tokens": 238377246.0, + "step": 6538 + }, + { + "epoch": 1.2142989786443825, + "grad_norm": 1.6824716329574585, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8775455951690674, + "num_tokens": 238411618.0, + "step": 6539 + }, + { + "epoch": 1.2144846796657383, + "grad_norm": 1.5449522733688354, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8721426725387573, + "num_tokens": 238448001.0, + "step": 6540 + }, + { + "epoch": 1.2146703806870938, + "grad_norm": 1.6653484106063843, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8644180297851562, + "num_tokens": 238482755.0, + "step": 6541 + }, + { + "epoch": 1.2148560817084495, + "grad_norm": 1.545795202255249, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8836228251457214, + "num_tokens": 238517379.0, + "step": 6542 + }, + { + "epoch": 1.215041782729805, + "grad_norm": 1.5928444862365723, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8637138605117798, + "num_tokens": 238554809.0, + "step": 6543 + }, + { + "epoch": 1.2152274837511605, + "grad_norm": 1.4820153713226318, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8620707988739014, + "num_tokens": 238597396.0, + "step": 6544 + }, + { + "epoch": 1.2154131847725163, + "grad_norm": 1.5969442129135132, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8833919763565063, + "num_tokens": 238631898.0, + "step": 6545 + }, + { + "epoch": 1.215598885793872, + "grad_norm": 1.5604981184005737, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8939001560211182, + "num_tokens": 238667786.0, + "step": 6546 + }, + { + "epoch": 1.2157845868152275, + "grad_norm": 1.6032363176345825, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8632973432540894, + "num_tokens": 238703647.0, + "step": 6547 + }, + { + "epoch": 1.215970287836583, + "grad_norm": 1.7519947290420532, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8834460973739624, + "num_tokens": 238732750.0, + "step": 6548 + }, + { + "epoch": 1.2161559888579387, + "grad_norm": 1.7775026559829712, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.869150698184967, + "num_tokens": 238766256.0, + "step": 6549 + }, + { + "epoch": 1.2163416898792943, + "grad_norm": 1.4122463464736938, + "learning_rate": 1e-06, + "loss": 0.2577, + "mean_token_accuracy": 0.9065338969230652, + "num_tokens": 238800511.0, + "step": 6550 + }, + { + "epoch": 1.21652739090065, + "grad_norm": 1.8332099914550781, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8562802076339722, + "num_tokens": 238831023.0, + "step": 6551 + }, + { + "epoch": 1.2167130919220055, + "grad_norm": 1.8421542644500732, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8691971898078918, + "num_tokens": 238859829.0, + "step": 6552 + }, + { + "epoch": 1.2168987929433612, + "grad_norm": 1.6246061325073242, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8673027157783508, + "num_tokens": 238898412.0, + "step": 6553 + }, + { + "epoch": 1.2170844939647167, + "grad_norm": 1.4532506465911865, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8874632716178894, + "num_tokens": 238940421.0, + "step": 6554 + }, + { + "epoch": 1.2172701949860725, + "grad_norm": 1.4359995126724243, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8849852085113525, + "num_tokens": 238978763.0, + "step": 6555 + }, + { + "epoch": 1.217455896007428, + "grad_norm": 1.5977082252502441, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.864950954914093, + "num_tokens": 239014092.0, + "step": 6556 + }, + { + "epoch": 1.2176415970287837, + "grad_norm": 1.4982300996780396, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8776341676712036, + "num_tokens": 239051772.0, + "step": 6557 + }, + { + "epoch": 1.2178272980501392, + "grad_norm": 1.6603726148605347, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8658655881881714, + "num_tokens": 239087443.0, + "step": 6558 + }, + { + "epoch": 1.218012999071495, + "grad_norm": 1.433619499206543, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8837592601776123, + "num_tokens": 239128795.0, + "step": 6559 + }, + { + "epoch": 1.2181987000928505, + "grad_norm": 1.6418582201004028, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8671215772628784, + "num_tokens": 239166946.0, + "step": 6560 + }, + { + "epoch": 1.2183844011142062, + "grad_norm": 1.7668578624725342, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8703830242156982, + "num_tokens": 239197935.0, + "step": 6561 + }, + { + "epoch": 1.2185701021355617, + "grad_norm": 1.5554187297821045, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8861715793609619, + "num_tokens": 239232485.0, + "step": 6562 + }, + { + "epoch": 1.2187558031569174, + "grad_norm": 1.5991171598434448, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8831866979598999, + "num_tokens": 239266682.0, + "step": 6563 + }, + { + "epoch": 1.218941504178273, + "grad_norm": 1.687627911567688, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8734235167503357, + "num_tokens": 239299034.0, + "step": 6564 + }, + { + "epoch": 1.2191272051996287, + "grad_norm": 1.4657317399978638, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8753555417060852, + "num_tokens": 239340429.0, + "step": 6565 + }, + { + "epoch": 1.2193129062209842, + "grad_norm": 1.6751097440719604, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8653361797332764, + "num_tokens": 239373427.0, + "step": 6566 + }, + { + "epoch": 1.2194986072423397, + "grad_norm": 1.424181342124939, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8936601877212524, + "num_tokens": 239409317.0, + "step": 6567 + }, + { + "epoch": 1.2196843082636954, + "grad_norm": 1.5304266214370728, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8766149282455444, + "num_tokens": 239446974.0, + "step": 6568 + }, + { + "epoch": 1.2198700092850512, + "grad_norm": 1.4397252798080444, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8852086663246155, + "num_tokens": 239484600.0, + "step": 6569 + }, + { + "epoch": 1.2200557103064067, + "grad_norm": 1.53890860080719, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8616498708724976, + "num_tokens": 239520628.0, + "step": 6570 + }, + { + "epoch": 1.2202414113277622, + "grad_norm": 1.5162981748580933, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8918243646621704, + "num_tokens": 239557655.0, + "step": 6571 + }, + { + "epoch": 1.220427112349118, + "grad_norm": 1.4896703958511353, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8771162033081055, + "num_tokens": 239599870.0, + "step": 6572 + }, + { + "epoch": 1.2206128133704734, + "grad_norm": 1.9403992891311646, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8667411804199219, + "num_tokens": 239631333.0, + "step": 6573 + }, + { + "epoch": 1.2207985143918292, + "grad_norm": 1.5675874948501587, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8618053197860718, + "num_tokens": 239669628.0, + "step": 6574 + }, + { + "epoch": 1.2209842154131847, + "grad_norm": 1.820068120956421, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8602704405784607, + "num_tokens": 239706389.0, + "step": 6575 + }, + { + "epoch": 1.2211699164345404, + "grad_norm": 1.6189749240875244, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8689242601394653, + "num_tokens": 239741454.0, + "step": 6576 + }, + { + "epoch": 1.221355617455896, + "grad_norm": 1.5062718391418457, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8699831366539001, + "num_tokens": 239781612.0, + "step": 6577 + }, + { + "epoch": 1.2215413184772517, + "grad_norm": 1.5412293672561646, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.870347261428833, + "num_tokens": 239820005.0, + "step": 6578 + }, + { + "epoch": 1.2217270194986072, + "grad_norm": 1.7870525121688843, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8821076154708862, + "num_tokens": 239850419.0, + "step": 6579 + }, + { + "epoch": 1.221912720519963, + "grad_norm": 1.5952274799346924, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8798890113830566, + "num_tokens": 239885095.0, + "step": 6580 + }, + { + "epoch": 1.2220984215413184, + "grad_norm": 1.4869805574417114, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8762897253036499, + "num_tokens": 239924123.0, + "step": 6581 + }, + { + "epoch": 1.2222841225626742, + "grad_norm": 1.5038385391235352, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8823856711387634, + "num_tokens": 239962543.0, + "step": 6582 + }, + { + "epoch": 1.2224698235840297, + "grad_norm": 1.5664345026016235, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8746575713157654, + "num_tokens": 240002001.0, + "step": 6583 + }, + { + "epoch": 1.2226555246053854, + "grad_norm": 1.4011205434799194, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8820300698280334, + "num_tokens": 240043716.0, + "step": 6584 + }, + { + "epoch": 1.222841225626741, + "grad_norm": 1.6266006231307983, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8787504434585571, + "num_tokens": 240079051.0, + "step": 6585 + }, + { + "epoch": 1.2230269266480966, + "grad_norm": 1.531652569770813, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8801544904708862, + "num_tokens": 240114342.0, + "step": 6586 + }, + { + "epoch": 1.2232126276694522, + "grad_norm": 1.497362732887268, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8768100142478943, + "num_tokens": 240153892.0, + "step": 6587 + }, + { + "epoch": 1.223398328690808, + "grad_norm": 1.4473987817764282, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8793175220489502, + "num_tokens": 240196630.0, + "step": 6588 + }, + { + "epoch": 1.2235840297121634, + "grad_norm": 1.5557388067245483, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8802894353866577, + "num_tokens": 240231921.0, + "step": 6589 + }, + { + "epoch": 1.223769730733519, + "grad_norm": 1.4092462062835693, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8867052793502808, + "num_tokens": 240271113.0, + "step": 6590 + }, + { + "epoch": 1.2239554317548746, + "grad_norm": 1.560856819152832, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8883304595947266, + "num_tokens": 240307324.0, + "step": 6591 + }, + { + "epoch": 1.2241411327762304, + "grad_norm": 1.5125174522399902, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8739300966262817, + "num_tokens": 240347060.0, + "step": 6592 + }, + { + "epoch": 1.224326833797586, + "grad_norm": 1.528554081916809, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8848111629486084, + "num_tokens": 240382763.0, + "step": 6593 + }, + { + "epoch": 1.2245125348189414, + "grad_norm": 1.5542858839035034, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8846498727798462, + "num_tokens": 240416073.0, + "step": 6594 + }, + { + "epoch": 1.2246982358402971, + "grad_norm": 1.551062822341919, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8757781386375427, + "num_tokens": 240453603.0, + "step": 6595 + }, + { + "epoch": 1.2248839368616526, + "grad_norm": 1.5436360836029053, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8771976232528687, + "num_tokens": 240490053.0, + "step": 6596 + }, + { + "epoch": 1.2250696378830084, + "grad_norm": 1.6099187135696411, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.87473464012146, + "num_tokens": 240524377.0, + "step": 6597 + }, + { + "epoch": 1.2252553389043639, + "grad_norm": 1.362004041671753, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8863540887832642, + "num_tokens": 240568666.0, + "step": 6598 + }, + { + "epoch": 1.2254410399257196, + "grad_norm": 1.5327746868133545, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8594611883163452, + "num_tokens": 240608804.0, + "step": 6599 + }, + { + "epoch": 1.2256267409470751, + "grad_norm": 1.5554853677749634, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8796278834342957, + "num_tokens": 240646479.0, + "step": 6600 + }, + { + "epoch": 1.2258124419684309, + "grad_norm": 1.5423353910446167, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8695781230926514, + "num_tokens": 240684649.0, + "step": 6601 + }, + { + "epoch": 1.2259981429897864, + "grad_norm": 1.4141188859939575, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8893985152244568, + "num_tokens": 240725525.0, + "step": 6602 + }, + { + "epoch": 1.226183844011142, + "grad_norm": 1.470348834991455, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8922632932662964, + "num_tokens": 240760188.0, + "step": 6603 + }, + { + "epoch": 1.2263695450324976, + "grad_norm": 1.619852900505066, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.874157190322876, + "num_tokens": 240793803.0, + "step": 6604 + }, + { + "epoch": 1.2265552460538534, + "grad_norm": 1.4716558456420898, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8687081336975098, + "num_tokens": 240837733.0, + "step": 6605 + }, + { + "epoch": 1.2267409470752089, + "grad_norm": 1.4062148332595825, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8784497976303101, + "num_tokens": 240879180.0, + "step": 6606 + }, + { + "epoch": 1.2269266480965646, + "grad_norm": 1.5168845653533936, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8716490268707275, + "num_tokens": 240917046.0, + "step": 6607 + }, + { + "epoch": 1.22711234911792, + "grad_norm": 1.5316076278686523, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.880351722240448, + "num_tokens": 240952726.0, + "step": 6608 + }, + { + "epoch": 1.2272980501392758, + "grad_norm": 1.5693082809448242, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8836576342582703, + "num_tokens": 240986798.0, + "step": 6609 + }, + { + "epoch": 1.2274837511606314, + "grad_norm": 1.5639164447784424, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8723919987678528, + "num_tokens": 241022509.0, + "step": 6610 + }, + { + "epoch": 1.227669452181987, + "grad_norm": 1.5402144193649292, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8814904093742371, + "num_tokens": 241059816.0, + "step": 6611 + }, + { + "epoch": 1.2278551532033426, + "grad_norm": 1.5085500478744507, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8675479888916016, + "num_tokens": 241102418.0, + "step": 6612 + }, + { + "epoch": 1.2280408542246983, + "grad_norm": 1.5026010274887085, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8938122987747192, + "num_tokens": 241135619.0, + "step": 6613 + }, + { + "epoch": 1.2282265552460538, + "grad_norm": 1.611199140548706, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8720016479492188, + "num_tokens": 241169473.0, + "step": 6614 + }, + { + "epoch": 1.2284122562674096, + "grad_norm": 1.4733320474624634, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8868053555488586, + "num_tokens": 241206891.0, + "step": 6615 + }, + { + "epoch": 1.228597957288765, + "grad_norm": 1.5116158723831177, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8894646167755127, + "num_tokens": 241241810.0, + "step": 6616 + }, + { + "epoch": 1.2287836583101206, + "grad_norm": 1.4825190305709839, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8951796293258667, + "num_tokens": 241277966.0, + "step": 6617 + }, + { + "epoch": 1.2289693593314763, + "grad_norm": 1.5227737426757812, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8914883732795715, + "num_tokens": 241311131.0, + "step": 6618 + }, + { + "epoch": 1.229155060352832, + "grad_norm": 1.448300838470459, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8762788772583008, + "num_tokens": 241349747.0, + "step": 6619 + }, + { + "epoch": 1.2293407613741876, + "grad_norm": 1.6404763460159302, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8860735893249512, + "num_tokens": 241382854.0, + "step": 6620 + }, + { + "epoch": 1.229526462395543, + "grad_norm": 1.6446236371994019, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8577568531036377, + "num_tokens": 241416267.0, + "step": 6621 + }, + { + "epoch": 1.2297121634168988, + "grad_norm": 1.6577547788619995, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.873065173625946, + "num_tokens": 241450248.0, + "step": 6622 + }, + { + "epoch": 1.2298978644382543, + "grad_norm": 1.3920313119888306, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8736429214477539, + "num_tokens": 241495295.0, + "step": 6623 + }, + { + "epoch": 1.23008356545961, + "grad_norm": 1.8724966049194336, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8679964542388916, + "num_tokens": 241523609.0, + "step": 6624 + }, + { + "epoch": 1.2302692664809656, + "grad_norm": 1.7329461574554443, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8698196411132812, + "num_tokens": 241553899.0, + "step": 6625 + }, + { + "epoch": 1.2304549675023213, + "grad_norm": 1.4556466341018677, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8861813545227051, + "num_tokens": 241590508.0, + "step": 6626 + }, + { + "epoch": 1.2306406685236768, + "grad_norm": 1.5402483940124512, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.869446337223053, + "num_tokens": 241628307.0, + "step": 6627 + }, + { + "epoch": 1.2308263695450326, + "grad_norm": 1.6568810939788818, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8735936880111694, + "num_tokens": 241662018.0, + "step": 6628 + }, + { + "epoch": 1.231012070566388, + "grad_norm": 1.4998191595077515, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8855907917022705, + "num_tokens": 241701129.0, + "step": 6629 + }, + { + "epoch": 1.2311977715877438, + "grad_norm": 1.6763088703155518, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8753101825714111, + "num_tokens": 241737887.0, + "step": 6630 + }, + { + "epoch": 1.2313834726090993, + "grad_norm": 1.6093353033065796, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8805912733078003, + "num_tokens": 241773020.0, + "step": 6631 + }, + { + "epoch": 1.231569173630455, + "grad_norm": 1.5711326599121094, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.876092255115509, + "num_tokens": 241807067.0, + "step": 6632 + }, + { + "epoch": 1.2317548746518105, + "grad_norm": 1.564699411392212, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8798062205314636, + "num_tokens": 241842248.0, + "step": 6633 + }, + { + "epoch": 1.2319405756731663, + "grad_norm": 1.7595841884613037, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8783105611801147, + "num_tokens": 241870349.0, + "step": 6634 + }, + { + "epoch": 1.2321262766945218, + "grad_norm": 1.5157874822616577, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8779181241989136, + "num_tokens": 241908940.0, + "step": 6635 + }, + { + "epoch": 1.2323119777158775, + "grad_norm": 1.5920675992965698, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8720237016677856, + "num_tokens": 241948546.0, + "step": 6636 + }, + { + "epoch": 1.232497678737233, + "grad_norm": 1.648057222366333, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.872624397277832, + "num_tokens": 241983608.0, + "step": 6637 + }, + { + "epoch": 1.2326833797585888, + "grad_norm": 1.6131818294525146, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8695703744888306, + "num_tokens": 242022166.0, + "step": 6638 + }, + { + "epoch": 1.2328690807799443, + "grad_norm": 1.6504502296447754, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8713719844818115, + "num_tokens": 242054377.0, + "step": 6639 + }, + { + "epoch": 1.2330547818012998, + "grad_norm": 1.5247454643249512, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8885398507118225, + "num_tokens": 242089163.0, + "step": 6640 + }, + { + "epoch": 1.2332404828226555, + "grad_norm": 1.4530919790267944, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8793072700500488, + "num_tokens": 242127676.0, + "step": 6641 + }, + { + "epoch": 1.2334261838440113, + "grad_norm": 1.7063167095184326, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8713057041168213, + "num_tokens": 242160176.0, + "step": 6642 + }, + { + "epoch": 1.2336118848653668, + "grad_norm": 1.5775028467178345, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8666557669639587, + "num_tokens": 242195484.0, + "step": 6643 + }, + { + "epoch": 1.2337975858867223, + "grad_norm": 1.5429644584655762, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8640958666801453, + "num_tokens": 242235399.0, + "step": 6644 + }, + { + "epoch": 1.233983286908078, + "grad_norm": 1.5168583393096924, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8854440450668335, + "num_tokens": 242271908.0, + "step": 6645 + }, + { + "epoch": 1.2341689879294335, + "grad_norm": 1.4666023254394531, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8795080184936523, + "num_tokens": 242310296.0, + "step": 6646 + }, + { + "epoch": 1.2343546889507893, + "grad_norm": 1.5215054750442505, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8833429217338562, + "num_tokens": 242344236.0, + "step": 6647 + }, + { + "epoch": 1.2345403899721448, + "grad_norm": 1.5079076290130615, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8846571445465088, + "num_tokens": 242380207.0, + "step": 6648 + }, + { + "epoch": 1.2347260909935005, + "grad_norm": 1.5614211559295654, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8729898929595947, + "num_tokens": 242416308.0, + "step": 6649 + }, + { + "epoch": 1.234911792014856, + "grad_norm": 1.5289572477340698, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8762553930282593, + "num_tokens": 242453065.0, + "step": 6650 + }, + { + "epoch": 1.2350974930362117, + "grad_norm": 1.742611289024353, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8588092923164368, + "num_tokens": 242486037.0, + "step": 6651 + }, + { + "epoch": 1.2352831940575673, + "grad_norm": 1.601806879043579, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.873475193977356, + "num_tokens": 242522568.0, + "step": 6652 + }, + { + "epoch": 1.235468895078923, + "grad_norm": 1.490286111831665, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8682002425193787, + "num_tokens": 242561458.0, + "step": 6653 + }, + { + "epoch": 1.2356545961002785, + "grad_norm": 1.5624674558639526, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8760871887207031, + "num_tokens": 242596183.0, + "step": 6654 + }, + { + "epoch": 1.2358402971216342, + "grad_norm": 1.536049246788025, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8840687274932861, + "num_tokens": 242629590.0, + "step": 6655 + }, + { + "epoch": 1.2360259981429897, + "grad_norm": 1.5551021099090576, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8783377408981323, + "num_tokens": 242665566.0, + "step": 6656 + }, + { + "epoch": 1.2362116991643455, + "grad_norm": 1.4211702346801758, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8873476982116699, + "num_tokens": 242707873.0, + "step": 6657 + }, + { + "epoch": 1.236397400185701, + "grad_norm": 1.5344842672348022, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8624318838119507, + "num_tokens": 242747250.0, + "step": 6658 + }, + { + "epoch": 1.2365831012070567, + "grad_norm": 1.552842617034912, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8803653120994568, + "num_tokens": 242783900.0, + "step": 6659 + }, + { + "epoch": 1.2367688022284122, + "grad_norm": 1.5281988382339478, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.879605233669281, + "num_tokens": 242819410.0, + "step": 6660 + }, + { + "epoch": 1.236954503249768, + "grad_norm": 1.5059053897857666, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8729194402694702, + "num_tokens": 242855515.0, + "step": 6661 + }, + { + "epoch": 1.2371402042711235, + "grad_norm": 1.4849821329116821, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8763574361801147, + "num_tokens": 242892283.0, + "step": 6662 + }, + { + "epoch": 1.237325905292479, + "grad_norm": 1.5906472206115723, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8779181241989136, + "num_tokens": 242927179.0, + "step": 6663 + }, + { + "epoch": 1.2375116063138347, + "grad_norm": 1.8105907440185547, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8651173710823059, + "num_tokens": 242957421.0, + "step": 6664 + }, + { + "epoch": 1.2376973073351905, + "grad_norm": 1.9312138557434082, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8606597781181335, + "num_tokens": 242985206.0, + "step": 6665 + }, + { + "epoch": 1.237883008356546, + "grad_norm": 1.7195801734924316, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8748563528060913, + "num_tokens": 243016548.0, + "step": 6666 + }, + { + "epoch": 1.2380687093779015, + "grad_norm": 1.5888203382492065, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8823158740997314, + "num_tokens": 243050109.0, + "step": 6667 + }, + { + "epoch": 1.2382544103992572, + "grad_norm": 1.5369873046875, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8804500102996826, + "num_tokens": 243087481.0, + "step": 6668 + }, + { + "epoch": 1.2384401114206127, + "grad_norm": 1.554466962814331, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8892488479614258, + "num_tokens": 243122311.0, + "step": 6669 + }, + { + "epoch": 1.2386258124419685, + "grad_norm": 1.4659128189086914, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8781403303146362, + "num_tokens": 243161502.0, + "step": 6670 + }, + { + "epoch": 1.238811513463324, + "grad_norm": 1.6270532608032227, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8769270181655884, + "num_tokens": 243195937.0, + "step": 6671 + }, + { + "epoch": 1.2389972144846797, + "grad_norm": 1.681100606918335, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8801968097686768, + "num_tokens": 243227070.0, + "step": 6672 + }, + { + "epoch": 1.2391829155060352, + "grad_norm": 1.693416953086853, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8612990379333496, + "num_tokens": 243260613.0, + "step": 6673 + }, + { + "epoch": 1.239368616527391, + "grad_norm": 1.574018120765686, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8696887493133545, + "num_tokens": 243298726.0, + "step": 6674 + }, + { + "epoch": 1.2395543175487465, + "grad_norm": 1.5636354684829712, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8649865388870239, + "num_tokens": 243338492.0, + "step": 6675 + }, + { + "epoch": 1.2397400185701022, + "grad_norm": 1.4696028232574463, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8855047821998596, + "num_tokens": 243375413.0, + "step": 6676 + }, + { + "epoch": 1.2399257195914577, + "grad_norm": 1.5681763887405396, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8613977432250977, + "num_tokens": 243412925.0, + "step": 6677 + }, + { + "epoch": 1.2401114206128134, + "grad_norm": 1.566023349761963, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8671588897705078, + "num_tokens": 243453071.0, + "step": 6678 + }, + { + "epoch": 1.240297121634169, + "grad_norm": 1.4908262491226196, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.884066104888916, + "num_tokens": 243488082.0, + "step": 6679 + }, + { + "epoch": 1.2404828226555247, + "grad_norm": 1.542487621307373, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8797011375427246, + "num_tokens": 243522756.0, + "step": 6680 + }, + { + "epoch": 1.2406685236768802, + "grad_norm": 1.6597591638565063, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8722467422485352, + "num_tokens": 243555279.0, + "step": 6681 + }, + { + "epoch": 1.240854224698236, + "grad_norm": 1.5852106809616089, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8801307678222656, + "num_tokens": 243590255.0, + "step": 6682 + }, + { + "epoch": 1.2410399257195914, + "grad_norm": 1.5384968519210815, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8715289235115051, + "num_tokens": 243626500.0, + "step": 6683 + }, + { + "epoch": 1.2412256267409472, + "grad_norm": 1.6615748405456543, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8700656890869141, + "num_tokens": 243657603.0, + "step": 6684 + }, + { + "epoch": 1.2414113277623027, + "grad_norm": 1.5288152694702148, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8748518824577332, + "num_tokens": 243699437.0, + "step": 6685 + }, + { + "epoch": 1.2415970287836582, + "grad_norm": 1.6018645763397217, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8824836015701294, + "num_tokens": 243733406.0, + "step": 6686 + }, + { + "epoch": 1.241782729805014, + "grad_norm": 1.4726964235305786, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8674838542938232, + "num_tokens": 243774926.0, + "step": 6687 + }, + { + "epoch": 1.2419684308263697, + "grad_norm": 1.6561856269836426, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.878840446472168, + "num_tokens": 243811678.0, + "step": 6688 + }, + { + "epoch": 1.2421541318477252, + "grad_norm": 1.615411400794983, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8774237632751465, + "num_tokens": 243844197.0, + "step": 6689 + }, + { + "epoch": 1.2423398328690807, + "grad_norm": 1.5146987438201904, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8882858753204346, + "num_tokens": 243879637.0, + "step": 6690 + }, + { + "epoch": 1.2425255338904364, + "grad_norm": 1.65534508228302, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8614544868469238, + "num_tokens": 243916626.0, + "step": 6691 + }, + { + "epoch": 1.2427112349117921, + "grad_norm": 1.4558275938034058, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8854138255119324, + "num_tokens": 243955319.0, + "step": 6692 + }, + { + "epoch": 1.2428969359331477, + "grad_norm": 1.568015694618225, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8806600570678711, + "num_tokens": 243992579.0, + "step": 6693 + }, + { + "epoch": 1.2430826369545032, + "grad_norm": 1.5266177654266357, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.887511670589447, + "num_tokens": 244030451.0, + "step": 6694 + }, + { + "epoch": 1.243268337975859, + "grad_norm": 1.5569994449615479, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8775879144668579, + "num_tokens": 244065737.0, + "step": 6695 + }, + { + "epoch": 1.2434540389972144, + "grad_norm": 1.5566132068634033, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8919336795806885, + "num_tokens": 244095731.0, + "step": 6696 + }, + { + "epoch": 1.2436397400185701, + "grad_norm": 1.6080577373504639, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8777544498443604, + "num_tokens": 244128194.0, + "step": 6697 + }, + { + "epoch": 1.2438254410399256, + "grad_norm": 1.532067894935608, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8775018453598022, + "num_tokens": 244169446.0, + "step": 6698 + }, + { + "epoch": 1.2440111420612814, + "grad_norm": 1.49006986618042, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8574365377426147, + "num_tokens": 244214859.0, + "step": 6699 + }, + { + "epoch": 1.244196843082637, + "grad_norm": 1.749766230583191, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8761441707611084, + "num_tokens": 244247554.0, + "step": 6700 + }, + { + "epoch": 1.2443825441039926, + "grad_norm": 1.5245931148529053, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8813538551330566, + "num_tokens": 244284137.0, + "step": 6701 + }, + { + "epoch": 1.2445682451253481, + "grad_norm": 1.470462679862976, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8837764263153076, + "num_tokens": 244320406.0, + "step": 6702 + }, + { + "epoch": 1.2447539461467039, + "grad_norm": 1.6055721044540405, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8644213676452637, + "num_tokens": 244357796.0, + "step": 6703 + }, + { + "epoch": 1.2449396471680594, + "grad_norm": 1.6935924291610718, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8690390586853027, + "num_tokens": 244394715.0, + "step": 6704 + }, + { + "epoch": 1.2451253481894151, + "grad_norm": 1.6172009706497192, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8675718903541565, + "num_tokens": 244431734.0, + "step": 6705 + }, + { + "epoch": 1.2453110492107706, + "grad_norm": 1.7863166332244873, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8872849941253662, + "num_tokens": 244466986.0, + "step": 6706 + }, + { + "epoch": 1.2454967502321264, + "grad_norm": 1.6104587316513062, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8733953237533569, + "num_tokens": 244502676.0, + "step": 6707 + }, + { + "epoch": 1.2456824512534819, + "grad_norm": 1.5873217582702637, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.869710385799408, + "num_tokens": 244541897.0, + "step": 6708 + }, + { + "epoch": 1.2458681522748376, + "grad_norm": 1.4588738679885864, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8852850198745728, + "num_tokens": 244582312.0, + "step": 6709 + }, + { + "epoch": 1.2460538532961931, + "grad_norm": 1.6546908617019653, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8719979524612427, + "num_tokens": 244616649.0, + "step": 6710 + }, + { + "epoch": 1.2462395543175488, + "grad_norm": 1.4637295007705688, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8763855695724487, + "num_tokens": 244656129.0, + "step": 6711 + }, + { + "epoch": 1.2464252553389044, + "grad_norm": 1.6488343477249146, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.890734076499939, + "num_tokens": 244686884.0, + "step": 6712 + }, + { + "epoch": 1.2466109563602599, + "grad_norm": 1.6410865783691406, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8864811658859253, + "num_tokens": 244722650.0, + "step": 6713 + }, + { + "epoch": 1.2467966573816156, + "grad_norm": 1.7957295179367065, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8723024129867554, + "num_tokens": 244757809.0, + "step": 6714 + }, + { + "epoch": 1.2469823584029713, + "grad_norm": 1.628715991973877, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8748934268951416, + "num_tokens": 244790742.0, + "step": 6715 + }, + { + "epoch": 1.2471680594243268, + "grad_norm": 1.6315706968307495, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8543318510055542, + "num_tokens": 244828294.0, + "step": 6716 + }, + { + "epoch": 1.2473537604456824, + "grad_norm": 1.6095027923583984, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8769205808639526, + "num_tokens": 244863156.0, + "step": 6717 + }, + { + "epoch": 1.247539461467038, + "grad_norm": 1.578841209411621, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.872214674949646, + "num_tokens": 244901414.0, + "step": 6718 + }, + { + "epoch": 1.2477251624883936, + "grad_norm": 1.7159762382507324, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8742138147354126, + "num_tokens": 244934919.0, + "step": 6719 + }, + { + "epoch": 1.2479108635097493, + "grad_norm": 1.70475435256958, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8640945553779602, + "num_tokens": 244969337.0, + "step": 6720 + }, + { + "epoch": 1.2480965645311048, + "grad_norm": 1.470733880996704, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8865199089050293, + "num_tokens": 245008386.0, + "step": 6721 + }, + { + "epoch": 1.2482822655524606, + "grad_norm": 1.611228108406067, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.867660403251648, + "num_tokens": 245041739.0, + "step": 6722 + }, + { + "epoch": 1.248467966573816, + "grad_norm": 1.6078404188156128, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8845322728157043, + "num_tokens": 245079878.0, + "step": 6723 + }, + { + "epoch": 1.2486536675951718, + "grad_norm": 1.592057704925537, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8796133995056152, + "num_tokens": 245115875.0, + "step": 6724 + }, + { + "epoch": 1.2488393686165273, + "grad_norm": 1.5241495370864868, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8846069574356079, + "num_tokens": 245150386.0, + "step": 6725 + }, + { + "epoch": 1.249025069637883, + "grad_norm": 1.49075186252594, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8821119070053101, + "num_tokens": 245186592.0, + "step": 6726 + }, + { + "epoch": 1.2492107706592386, + "grad_norm": 1.5492472648620605, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8691584467887878, + "num_tokens": 245225879.0, + "step": 6727 + }, + { + "epoch": 1.2493964716805943, + "grad_norm": 1.6007968187332153, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.88131183385849, + "num_tokens": 245258157.0, + "step": 6728 + }, + { + "epoch": 1.2495821727019498, + "grad_norm": 1.572901725769043, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8838579654693604, + "num_tokens": 245292048.0, + "step": 6729 + }, + { + "epoch": 1.2497678737233056, + "grad_norm": 1.891312837600708, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8762147426605225, + "num_tokens": 245322959.0, + "step": 6730 + }, + { + "epoch": 1.249953574744661, + "grad_norm": 1.7598280906677246, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8620654344558716, + "num_tokens": 245353708.0, + "step": 6731 + }, + { + "epoch": 1.2501392757660166, + "grad_norm": 1.8197581768035889, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8623498678207397, + "num_tokens": 245383839.0, + "step": 6732 + }, + { + "epoch": 1.2503249767873723, + "grad_norm": 1.5926826000213623, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.886628270149231, + "num_tokens": 245418457.0, + "step": 6733 + }, + { + "epoch": 1.250510677808728, + "grad_norm": 1.5477513074874878, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8725973963737488, + "num_tokens": 245459324.0, + "step": 6734 + }, + { + "epoch": 1.2506963788300836, + "grad_norm": 1.580426812171936, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8867298364639282, + "num_tokens": 245497105.0, + "step": 6735 + }, + { + "epoch": 1.250882079851439, + "grad_norm": 1.7331645488739014, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8685609698295593, + "num_tokens": 245527044.0, + "step": 6736 + }, + { + "epoch": 1.2510677808727948, + "grad_norm": 1.5769727230072021, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8748738765716553, + "num_tokens": 245562390.0, + "step": 6737 + }, + { + "epoch": 1.2512534818941505, + "grad_norm": 1.6014881134033203, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8900291919708252, + "num_tokens": 245599379.0, + "step": 6738 + }, + { + "epoch": 1.251439182915506, + "grad_norm": 1.5850762128829956, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8724993467330933, + "num_tokens": 245636393.0, + "step": 6739 + }, + { + "epoch": 1.2516248839368616, + "grad_norm": 1.410322666168213, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8982765078544617, + "num_tokens": 245675447.0, + "step": 6740 + }, + { + "epoch": 1.2518105849582173, + "grad_norm": 1.3325440883636475, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.881833553314209, + "num_tokens": 245720261.0, + "step": 6741 + }, + { + "epoch": 1.251996285979573, + "grad_norm": 1.4405314922332764, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8910882472991943, + "num_tokens": 245755569.0, + "step": 6742 + }, + { + "epoch": 1.2521819870009285, + "grad_norm": 1.4228729009628296, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8841822147369385, + "num_tokens": 245795550.0, + "step": 6743 + }, + { + "epoch": 1.252367688022284, + "grad_norm": 1.539018154144287, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8800581693649292, + "num_tokens": 245829883.0, + "step": 6744 + }, + { + "epoch": 1.2525533890436398, + "grad_norm": 1.5178171396255493, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8691116571426392, + "num_tokens": 245871285.0, + "step": 6745 + }, + { + "epoch": 1.2527390900649953, + "grad_norm": 1.5292344093322754, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8802884817123413, + "num_tokens": 245907389.0, + "step": 6746 + }, + { + "epoch": 1.252924791086351, + "grad_norm": 1.5625543594360352, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8698078393936157, + "num_tokens": 245941898.0, + "step": 6747 + }, + { + "epoch": 1.2531104921077065, + "grad_norm": 1.7203437089920044, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8785878419876099, + "num_tokens": 245974632.0, + "step": 6748 + }, + { + "epoch": 1.2532961931290623, + "grad_norm": 1.4924877882003784, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8662070035934448, + "num_tokens": 246011840.0, + "step": 6749 + }, + { + "epoch": 1.2534818941504178, + "grad_norm": 1.558575987815857, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.888434648513794, + "num_tokens": 246045891.0, + "step": 6750 + }, + { + "epoch": 1.2536675951717735, + "grad_norm": 1.5581361055374146, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8881167769432068, + "num_tokens": 246078969.0, + "step": 6751 + }, + { + "epoch": 1.253853296193129, + "grad_norm": 1.6976723670959473, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8754974007606506, + "num_tokens": 246113317.0, + "step": 6752 + }, + { + "epoch": 1.2540389972144848, + "grad_norm": 1.719387412071228, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8740564584732056, + "num_tokens": 246143750.0, + "step": 6753 + }, + { + "epoch": 1.2542246982358403, + "grad_norm": 1.522646427154541, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8862090110778809, + "num_tokens": 246179137.0, + "step": 6754 + }, + { + "epoch": 1.254410399257196, + "grad_norm": 1.477108120918274, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8833509087562561, + "num_tokens": 246218388.0, + "step": 6755 + }, + { + "epoch": 1.2545961002785515, + "grad_norm": 1.6135774850845337, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8861056566238403, + "num_tokens": 246255073.0, + "step": 6756 + }, + { + "epoch": 1.2547818012999072, + "grad_norm": 1.6275509595870972, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8781097531318665, + "num_tokens": 246287912.0, + "step": 6757 + }, + { + "epoch": 1.2549675023212628, + "grad_norm": 1.624556064605713, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8747020959854126, + "num_tokens": 246323509.0, + "step": 6758 + }, + { + "epoch": 1.2551532033426183, + "grad_norm": 1.560502052307129, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8929412961006165, + "num_tokens": 246356984.0, + "step": 6759 + }, + { + "epoch": 1.255338904363974, + "grad_norm": 1.5434757471084595, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8791806697845459, + "num_tokens": 246392747.0, + "step": 6760 + }, + { + "epoch": 1.2555246053853297, + "grad_norm": 1.5562056303024292, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8723127841949463, + "num_tokens": 246429445.0, + "step": 6761 + }, + { + "epoch": 1.2557103064066852, + "grad_norm": 1.4698206186294556, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8772748708724976, + "num_tokens": 246468303.0, + "step": 6762 + }, + { + "epoch": 1.2558960074280408, + "grad_norm": 1.5506465435028076, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8722598552703857, + "num_tokens": 246507946.0, + "step": 6763 + }, + { + "epoch": 1.2560817084493965, + "grad_norm": 1.5895766019821167, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8766512274742126, + "num_tokens": 246543721.0, + "step": 6764 + }, + { + "epoch": 1.2562674094707522, + "grad_norm": 1.5278701782226562, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.88458251953125, + "num_tokens": 246577651.0, + "step": 6765 + }, + { + "epoch": 1.2564531104921077, + "grad_norm": 1.4404302835464478, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8783105611801147, + "num_tokens": 246616413.0, + "step": 6766 + }, + { + "epoch": 1.2566388115134632, + "grad_norm": 1.3509618043899536, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.882089376449585, + "num_tokens": 246657783.0, + "step": 6767 + }, + { + "epoch": 1.256824512534819, + "grad_norm": 1.472448468208313, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8659427165985107, + "num_tokens": 246698640.0, + "step": 6768 + }, + { + "epoch": 1.2570102135561745, + "grad_norm": 1.5005732774734497, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.877281665802002, + "num_tokens": 246734427.0, + "step": 6769 + }, + { + "epoch": 1.2571959145775302, + "grad_norm": 1.6015397310256958, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.888608455657959, + "num_tokens": 246767211.0, + "step": 6770 + }, + { + "epoch": 1.2573816155988857, + "grad_norm": 1.57224440574646, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8819641470909119, + "num_tokens": 246802438.0, + "step": 6771 + }, + { + "epoch": 1.2575673166202415, + "grad_norm": 1.4721585512161255, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8915836215019226, + "num_tokens": 246840909.0, + "step": 6772 + }, + { + "epoch": 1.257753017641597, + "grad_norm": 1.5163516998291016, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.887195348739624, + "num_tokens": 246876749.0, + "step": 6773 + }, + { + "epoch": 1.2579387186629527, + "grad_norm": 1.6254608631134033, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8707807064056396, + "num_tokens": 246912026.0, + "step": 6774 + }, + { + "epoch": 1.2581244196843082, + "grad_norm": 1.51766836643219, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8839148283004761, + "num_tokens": 246950840.0, + "step": 6775 + }, + { + "epoch": 1.258310120705664, + "grad_norm": 1.5704890489578247, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8793007731437683, + "num_tokens": 246984844.0, + "step": 6776 + }, + { + "epoch": 1.2584958217270195, + "grad_norm": 1.687469244003296, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8636117577552795, + "num_tokens": 247022106.0, + "step": 6777 + }, + { + "epoch": 1.2586815227483752, + "grad_norm": 1.6054933071136475, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8757809400558472, + "num_tokens": 247058905.0, + "step": 6778 + }, + { + "epoch": 1.2588672237697307, + "grad_norm": 1.5710067749023438, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8711209297180176, + "num_tokens": 247093781.0, + "step": 6779 + }, + { + "epoch": 1.2590529247910864, + "grad_norm": 1.5791633129119873, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8781076669692993, + "num_tokens": 247130229.0, + "step": 6780 + }, + { + "epoch": 1.259238625812442, + "grad_norm": 1.5307961702346802, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8766932487487793, + "num_tokens": 247166586.0, + "step": 6781 + }, + { + "epoch": 1.2594243268337975, + "grad_norm": 1.760007381439209, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8630825877189636, + "num_tokens": 247202969.0, + "step": 6782 + }, + { + "epoch": 1.2596100278551532, + "grad_norm": 1.6746134757995605, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8649038076400757, + "num_tokens": 247238343.0, + "step": 6783 + }, + { + "epoch": 1.259795728876509, + "grad_norm": 1.5743356943130493, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8594573140144348, + "num_tokens": 247275756.0, + "step": 6784 + }, + { + "epoch": 1.2599814298978644, + "grad_norm": 1.5317511558532715, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8899859189987183, + "num_tokens": 247310817.0, + "step": 6785 + }, + { + "epoch": 1.26016713091922, + "grad_norm": 1.5002459287643433, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8912944793701172, + "num_tokens": 247347720.0, + "step": 6786 + }, + { + "epoch": 1.2603528319405757, + "grad_norm": 1.7354681491851807, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8676855564117432, + "num_tokens": 247378292.0, + "step": 6787 + }, + { + "epoch": 1.2605385329619314, + "grad_norm": 1.6212860345840454, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8826432228088379, + "num_tokens": 247410758.0, + "step": 6788 + }, + { + "epoch": 1.260724233983287, + "grad_norm": 1.510938048362732, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8850260972976685, + "num_tokens": 247446509.0, + "step": 6789 + }, + { + "epoch": 1.2609099350046424, + "grad_norm": 1.5193032026290894, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8755947947502136, + "num_tokens": 247484624.0, + "step": 6790 + }, + { + "epoch": 1.2610956360259982, + "grad_norm": 1.6651999950408936, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8718211650848389, + "num_tokens": 247517849.0, + "step": 6791 + }, + { + "epoch": 1.2612813370473537, + "grad_norm": 1.5531089305877686, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8674577474594116, + "num_tokens": 247558530.0, + "step": 6792 + }, + { + "epoch": 1.2614670380687094, + "grad_norm": 1.5783249139785767, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8749749064445496, + "num_tokens": 247594326.0, + "step": 6793 + }, + { + "epoch": 1.261652739090065, + "grad_norm": 1.6282767057418823, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8676085472106934, + "num_tokens": 247632549.0, + "step": 6794 + }, + { + "epoch": 1.2618384401114207, + "grad_norm": 1.54926335811615, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8945209980010986, + "num_tokens": 247669570.0, + "step": 6795 + }, + { + "epoch": 1.2620241411327762, + "grad_norm": 1.5685890913009644, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8709434270858765, + "num_tokens": 247708355.0, + "step": 6796 + }, + { + "epoch": 1.262209842154132, + "grad_norm": 1.6316719055175781, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8721017837524414, + "num_tokens": 247742290.0, + "step": 6797 + }, + { + "epoch": 1.2623955431754874, + "grad_norm": 1.5080235004425049, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8820868134498596, + "num_tokens": 247779555.0, + "step": 6798 + }, + { + "epoch": 1.2625812441968431, + "grad_norm": 1.5372401475906372, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8699829578399658, + "num_tokens": 247819743.0, + "step": 6799 + }, + { + "epoch": 1.2627669452181987, + "grad_norm": 1.7258052825927734, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8690156936645508, + "num_tokens": 247852454.0, + "step": 6800 + }, + { + "epoch": 1.2629526462395544, + "grad_norm": 1.5557423830032349, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8765242099761963, + "num_tokens": 247891775.0, + "step": 6801 + }, + { + "epoch": 1.26313834726091, + "grad_norm": 1.4329195022583008, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8782447576522827, + "num_tokens": 247934346.0, + "step": 6802 + }, + { + "epoch": 1.2633240482822656, + "grad_norm": 1.6817877292633057, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8790507912635803, + "num_tokens": 247966256.0, + "step": 6803 + }, + { + "epoch": 1.2635097493036211, + "grad_norm": 1.5841094255447388, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8654015064239502, + "num_tokens": 248002121.0, + "step": 6804 + }, + { + "epoch": 1.2636954503249767, + "grad_norm": 1.6195358037948608, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8796001672744751, + "num_tokens": 248033037.0, + "step": 6805 + }, + { + "epoch": 1.2638811513463324, + "grad_norm": 1.3919061422348022, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8730102777481079, + "num_tokens": 248074932.0, + "step": 6806 + }, + { + "epoch": 1.2640668523676881, + "grad_norm": 1.7242034673690796, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8827432990074158, + "num_tokens": 248103636.0, + "step": 6807 + }, + { + "epoch": 1.2642525533890436, + "grad_norm": 1.4754042625427246, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8895672559738159, + "num_tokens": 248137744.0, + "step": 6808 + }, + { + "epoch": 1.2644382544103991, + "grad_norm": 1.7558231353759766, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.875397801399231, + "num_tokens": 248167106.0, + "step": 6809 + }, + { + "epoch": 1.2646239554317549, + "grad_norm": 1.3714936971664429, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8804185390472412, + "num_tokens": 248212350.0, + "step": 6810 + }, + { + "epoch": 1.2648096564531106, + "grad_norm": 1.541377305984497, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8807576298713684, + "num_tokens": 248245349.0, + "step": 6811 + }, + { + "epoch": 1.2649953574744661, + "grad_norm": 1.5157352685928345, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8842558860778809, + "num_tokens": 248277497.0, + "step": 6812 + }, + { + "epoch": 1.2651810584958216, + "grad_norm": 1.5289455652236938, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8669639825820923, + "num_tokens": 248315404.0, + "step": 6813 + }, + { + "epoch": 1.2653667595171774, + "grad_norm": 1.62433660030365, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8831735849380493, + "num_tokens": 248349605.0, + "step": 6814 + }, + { + "epoch": 1.265552460538533, + "grad_norm": 1.6150349378585815, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8636119961738586, + "num_tokens": 248388152.0, + "step": 6815 + }, + { + "epoch": 1.2657381615598886, + "grad_norm": 1.6344252824783325, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.870262861251831, + "num_tokens": 248422732.0, + "step": 6816 + }, + { + "epoch": 1.2659238625812441, + "grad_norm": 1.6549773216247559, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8660155534744263, + "num_tokens": 248457645.0, + "step": 6817 + }, + { + "epoch": 1.2661095636025999, + "grad_norm": 1.5457133054733276, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.867192804813385, + "num_tokens": 248493977.0, + "step": 6818 + }, + { + "epoch": 1.2662952646239554, + "grad_norm": 1.6476563215255737, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8670552372932434, + "num_tokens": 248527515.0, + "step": 6819 + }, + { + "epoch": 1.266480965645311, + "grad_norm": 1.5149604082107544, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8844729065895081, + "num_tokens": 248563810.0, + "step": 6820 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.4298343658447266, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8873668909072876, + "num_tokens": 248601540.0, + "step": 6821 + }, + { + "epoch": 1.2668523676880223, + "grad_norm": 1.5997178554534912, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8746871948242188, + "num_tokens": 248635865.0, + "step": 6822 + }, + { + "epoch": 1.2670380687093779, + "grad_norm": 1.5859943628311157, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8776684999465942, + "num_tokens": 248672241.0, + "step": 6823 + }, + { + "epoch": 1.2672237697307336, + "grad_norm": 1.6654505729675293, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8780509829521179, + "num_tokens": 248704647.0, + "step": 6824 + }, + { + "epoch": 1.267409470752089, + "grad_norm": 1.6914894580841064, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8676035404205322, + "num_tokens": 248735187.0, + "step": 6825 + }, + { + "epoch": 1.2675951717734448, + "grad_norm": 1.6673476696014404, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8609994649887085, + "num_tokens": 248769373.0, + "step": 6826 + }, + { + "epoch": 1.2677808727948003, + "grad_norm": 1.616821527481079, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8707980513572693, + "num_tokens": 248804387.0, + "step": 6827 + }, + { + "epoch": 1.2679665738161559, + "grad_norm": 1.671209454536438, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8659765720367432, + "num_tokens": 248837169.0, + "step": 6828 + }, + { + "epoch": 1.2681522748375116, + "grad_norm": 1.5851799249649048, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8721575736999512, + "num_tokens": 248875851.0, + "step": 6829 + }, + { + "epoch": 1.2683379758588673, + "grad_norm": 1.4350149631500244, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8948444128036499, + "num_tokens": 248912552.0, + "step": 6830 + }, + { + "epoch": 1.2685236768802228, + "grad_norm": 1.8146644830703735, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8644376993179321, + "num_tokens": 248941431.0, + "step": 6831 + }, + { + "epoch": 1.2687093779015783, + "grad_norm": 1.6105422973632812, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8694583177566528, + "num_tokens": 248978360.0, + "step": 6832 + }, + { + "epoch": 1.268895078922934, + "grad_norm": 1.627210021018982, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8776757121086121, + "num_tokens": 249012039.0, + "step": 6833 + }, + { + "epoch": 1.2690807799442898, + "grad_norm": 1.7074202299118042, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.86176598072052, + "num_tokens": 249043908.0, + "step": 6834 + }, + { + "epoch": 1.2692664809656453, + "grad_norm": 1.5944609642028809, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8848443031311035, + "num_tokens": 249076386.0, + "step": 6835 + }, + { + "epoch": 1.2694521819870008, + "grad_norm": 1.6025060415267944, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8787535429000854, + "num_tokens": 249109386.0, + "step": 6836 + }, + { + "epoch": 1.2696378830083566, + "grad_norm": 1.6795361042022705, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.868522584438324, + "num_tokens": 249146508.0, + "step": 6837 + }, + { + "epoch": 1.2698235840297123, + "grad_norm": 1.5509788990020752, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8877884745597839, + "num_tokens": 249182592.0, + "step": 6838 + }, + { + "epoch": 1.2700092850510678, + "grad_norm": 1.6336700916290283, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.856948733329773, + "num_tokens": 249220156.0, + "step": 6839 + }, + { + "epoch": 1.2701949860724233, + "grad_norm": 1.5753400325775146, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8690345287322998, + "num_tokens": 249255830.0, + "step": 6840 + }, + { + "epoch": 1.270380687093779, + "grad_norm": 1.5555599927902222, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8836331367492676, + "num_tokens": 249291290.0, + "step": 6841 + }, + { + "epoch": 1.2705663881151346, + "grad_norm": 1.5461596250534058, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8727084994316101, + "num_tokens": 249335730.0, + "step": 6842 + }, + { + "epoch": 1.2707520891364903, + "grad_norm": 1.5579906702041626, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8672837615013123, + "num_tokens": 249373803.0, + "step": 6843 + }, + { + "epoch": 1.2709377901578458, + "grad_norm": 1.6491773128509521, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8637089729309082, + "num_tokens": 249409420.0, + "step": 6844 + }, + { + "epoch": 1.2711234911792015, + "grad_norm": 1.647383689880371, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8835099935531616, + "num_tokens": 249441302.0, + "step": 6845 + }, + { + "epoch": 1.271309192200557, + "grad_norm": 1.4741085767745972, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8816370368003845, + "num_tokens": 249479745.0, + "step": 6846 + }, + { + "epoch": 1.2714948932219128, + "grad_norm": 1.5814199447631836, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8706220984458923, + "num_tokens": 249521323.0, + "step": 6847 + }, + { + "epoch": 1.2716805942432683, + "grad_norm": 1.5248043537139893, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8711748719215393, + "num_tokens": 249559507.0, + "step": 6848 + }, + { + "epoch": 1.271866295264624, + "grad_norm": 1.5150338411331177, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8807265758514404, + "num_tokens": 249599321.0, + "step": 6849 + }, + { + "epoch": 1.2720519962859795, + "grad_norm": 1.616058588027954, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8725329041481018, + "num_tokens": 249633713.0, + "step": 6850 + }, + { + "epoch": 1.2722376973073353, + "grad_norm": 1.666628122329712, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.863534688949585, + "num_tokens": 249671657.0, + "step": 6851 + }, + { + "epoch": 1.2724233983286908, + "grad_norm": 1.4364490509033203, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8845909833908081, + "num_tokens": 249711763.0, + "step": 6852 + }, + { + "epoch": 1.2726090993500465, + "grad_norm": 1.617149829864502, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8806964159011841, + "num_tokens": 249744039.0, + "step": 6853 + }, + { + "epoch": 1.272794800371402, + "grad_norm": 1.610657811164856, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8785833120346069, + "num_tokens": 249778015.0, + "step": 6854 + }, + { + "epoch": 1.2729805013927575, + "grad_norm": 1.5452362298965454, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8742958307266235, + "num_tokens": 249813942.0, + "step": 6855 + }, + { + "epoch": 1.2731662024141133, + "grad_norm": 1.5510245561599731, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8782636523246765, + "num_tokens": 249851311.0, + "step": 6856 + }, + { + "epoch": 1.273351903435469, + "grad_norm": 1.63480544090271, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8751652240753174, + "num_tokens": 249886307.0, + "step": 6857 + }, + { + "epoch": 1.2735376044568245, + "grad_norm": 1.679339051246643, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8683178424835205, + "num_tokens": 249923558.0, + "step": 6858 + }, + { + "epoch": 1.27372330547818, + "grad_norm": 1.565685510635376, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8813640475273132, + "num_tokens": 249960712.0, + "step": 6859 + }, + { + "epoch": 1.2739090064995358, + "grad_norm": 1.5815949440002441, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8782441020011902, + "num_tokens": 249998619.0, + "step": 6860 + }, + { + "epoch": 1.2740947075208915, + "grad_norm": 1.4814823865890503, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8852053284645081, + "num_tokens": 250037193.0, + "step": 6861 + }, + { + "epoch": 1.274280408542247, + "grad_norm": 1.5444912910461426, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.868043065071106, + "num_tokens": 250072033.0, + "step": 6862 + }, + { + "epoch": 1.2744661095636025, + "grad_norm": 1.4751567840576172, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8983221054077148, + "num_tokens": 250106095.0, + "step": 6863 + }, + { + "epoch": 1.2746518105849582, + "grad_norm": 1.604487419128418, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8797777891159058, + "num_tokens": 250141121.0, + "step": 6864 + }, + { + "epoch": 1.2748375116063138, + "grad_norm": 1.6529314517974854, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8900303840637207, + "num_tokens": 250172000.0, + "step": 6865 + }, + { + "epoch": 1.2750232126276695, + "grad_norm": 1.5349438190460205, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8762378692626953, + "num_tokens": 250205361.0, + "step": 6866 + }, + { + "epoch": 1.275208913649025, + "grad_norm": 1.7289668321609497, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8727031946182251, + "num_tokens": 250241534.0, + "step": 6867 + }, + { + "epoch": 1.2753946146703807, + "grad_norm": 1.5090806484222412, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8884254693984985, + "num_tokens": 250276950.0, + "step": 6868 + }, + { + "epoch": 1.2755803156917362, + "grad_norm": 1.4081854820251465, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8917320370674133, + "num_tokens": 250313501.0, + "step": 6869 + }, + { + "epoch": 1.275766016713092, + "grad_norm": 1.5980604887008667, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8829873204231262, + "num_tokens": 250346799.0, + "step": 6870 + }, + { + "epoch": 1.2759517177344475, + "grad_norm": 1.5401407480239868, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8703760504722595, + "num_tokens": 250386143.0, + "step": 6871 + }, + { + "epoch": 1.2761374187558032, + "grad_norm": 1.6948994398117065, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8670413494110107, + "num_tokens": 250417925.0, + "step": 6872 + }, + { + "epoch": 1.2763231197771587, + "grad_norm": 1.4417436122894287, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8763130903244019, + "num_tokens": 250458769.0, + "step": 6873 + }, + { + "epoch": 1.2765088207985145, + "grad_norm": 1.6569905281066895, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8718987703323364, + "num_tokens": 250491320.0, + "step": 6874 + }, + { + "epoch": 1.27669452181987, + "grad_norm": 1.629647135734558, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.864441990852356, + "num_tokens": 250526200.0, + "step": 6875 + }, + { + "epoch": 1.2768802228412257, + "grad_norm": 1.5492976903915405, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8785282373428345, + "num_tokens": 250562728.0, + "step": 6876 + }, + { + "epoch": 1.2770659238625812, + "grad_norm": 1.5280357599258423, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8849789500236511, + "num_tokens": 250597150.0, + "step": 6877 + }, + { + "epoch": 1.2772516248839367, + "grad_norm": 1.6063897609710693, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8744339346885681, + "num_tokens": 250633498.0, + "step": 6878 + }, + { + "epoch": 1.2774373259052925, + "grad_norm": 1.5042450428009033, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8946118354797363, + "num_tokens": 250666360.0, + "step": 6879 + }, + { + "epoch": 1.2776230269266482, + "grad_norm": 1.5293203592300415, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8766949772834778, + "num_tokens": 250707334.0, + "step": 6880 + }, + { + "epoch": 1.2778087279480037, + "grad_norm": 1.6515846252441406, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8779571056365967, + "num_tokens": 250738758.0, + "step": 6881 + }, + { + "epoch": 1.2779944289693592, + "grad_norm": 1.583065152168274, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8788015842437744, + "num_tokens": 250772655.0, + "step": 6882 + }, + { + "epoch": 1.278180129990715, + "grad_norm": 1.6970988512039185, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8787180781364441, + "num_tokens": 250803575.0, + "step": 6883 + }, + { + "epoch": 1.2783658310120707, + "grad_norm": 1.5689196586608887, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8677281737327576, + "num_tokens": 250841110.0, + "step": 6884 + }, + { + "epoch": 1.2785515320334262, + "grad_norm": 1.4824721813201904, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8805841207504272, + "num_tokens": 250880331.0, + "step": 6885 + }, + { + "epoch": 1.2787372330547817, + "grad_norm": 1.6196913719177246, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8718220591545105, + "num_tokens": 250914780.0, + "step": 6886 + }, + { + "epoch": 1.2789229340761374, + "grad_norm": 1.580976128578186, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8801762461662292, + "num_tokens": 250948115.0, + "step": 6887 + }, + { + "epoch": 1.2791086350974932, + "grad_norm": 1.534481406211853, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8819515705108643, + "num_tokens": 250986323.0, + "step": 6888 + }, + { + "epoch": 1.2792943361188487, + "grad_norm": 1.620582938194275, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8734406232833862, + "num_tokens": 251023588.0, + "step": 6889 + }, + { + "epoch": 1.2794800371402042, + "grad_norm": 1.5824756622314453, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8489559292793274, + "num_tokens": 251063730.0, + "step": 6890 + }, + { + "epoch": 1.27966573816156, + "grad_norm": 1.54727303981781, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8856834769248962, + "num_tokens": 251097996.0, + "step": 6891 + }, + { + "epoch": 1.2798514391829154, + "grad_norm": 1.4959959983825684, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8751642107963562, + "num_tokens": 251136619.0, + "step": 6892 + }, + { + "epoch": 1.2800371402042712, + "grad_norm": 1.4732924699783325, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.867929220199585, + "num_tokens": 251180422.0, + "step": 6893 + }, + { + "epoch": 1.2802228412256267, + "grad_norm": 1.5845787525177002, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8845264911651611, + "num_tokens": 251213416.0, + "step": 6894 + }, + { + "epoch": 1.2804085422469824, + "grad_norm": 1.6145929098129272, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.879198431968689, + "num_tokens": 251247505.0, + "step": 6895 + }, + { + "epoch": 1.280594243268338, + "grad_norm": 1.6694692373275757, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8728379607200623, + "num_tokens": 251278038.0, + "step": 6896 + }, + { + "epoch": 1.2807799442896937, + "grad_norm": 1.5316582918167114, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8820937871932983, + "num_tokens": 251315761.0, + "step": 6897 + }, + { + "epoch": 1.2809656453110492, + "grad_norm": 1.5638781785964966, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8737502098083496, + "num_tokens": 251354082.0, + "step": 6898 + }, + { + "epoch": 1.281151346332405, + "grad_norm": 1.5090930461883545, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8910622596740723, + "num_tokens": 251389378.0, + "step": 6899 + }, + { + "epoch": 1.2813370473537604, + "grad_norm": 1.5751515626907349, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8886003494262695, + "num_tokens": 251427259.0, + "step": 6900 + }, + { + "epoch": 1.281522748375116, + "grad_norm": 1.5472490787506104, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8801528215408325, + "num_tokens": 251466819.0, + "step": 6901 + }, + { + "epoch": 1.2817084493964717, + "grad_norm": 1.5135725736618042, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.878459095954895, + "num_tokens": 251502132.0, + "step": 6902 + }, + { + "epoch": 1.2818941504178274, + "grad_norm": 1.6247245073318481, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8787716627120972, + "num_tokens": 251540848.0, + "step": 6903 + }, + { + "epoch": 1.282079851439183, + "grad_norm": 1.5044631958007812, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.874265193939209, + "num_tokens": 251579867.0, + "step": 6904 + }, + { + "epoch": 1.2822655524605384, + "grad_norm": 1.6743323802947998, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8653292059898376, + "num_tokens": 251613146.0, + "step": 6905 + }, + { + "epoch": 1.2824512534818941, + "grad_norm": 1.5679882764816284, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8857669830322266, + "num_tokens": 251648283.0, + "step": 6906 + }, + { + "epoch": 1.2826369545032499, + "grad_norm": 1.795657753944397, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8720058798789978, + "num_tokens": 251675981.0, + "step": 6907 + }, + { + "epoch": 1.2828226555246054, + "grad_norm": 1.7431169748306274, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8849301934242249, + "num_tokens": 251706932.0, + "step": 6908 + }, + { + "epoch": 1.283008356545961, + "grad_norm": 1.5066090822219849, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8708105087280273, + "num_tokens": 251745721.0, + "step": 6909 + }, + { + "epoch": 1.2831940575673166, + "grad_norm": 1.5092275142669678, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8737170696258545, + "num_tokens": 251786969.0, + "step": 6910 + }, + { + "epoch": 1.2833797585886724, + "grad_norm": 1.6306393146514893, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8643476963043213, + "num_tokens": 251825149.0, + "step": 6911 + }, + { + "epoch": 1.2835654596100279, + "grad_norm": 1.5731594562530518, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8644068241119385, + "num_tokens": 251863460.0, + "step": 6912 + }, + { + "epoch": 1.2837511606313834, + "grad_norm": 1.5376414060592651, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8747601509094238, + "num_tokens": 251902179.0, + "step": 6913 + }, + { + "epoch": 1.2839368616527391, + "grad_norm": 1.7712681293487549, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8548983335494995, + "num_tokens": 251939357.0, + "step": 6914 + }, + { + "epoch": 1.2841225626740946, + "grad_norm": 1.6579440832138062, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8688770532608032, + "num_tokens": 251977425.0, + "step": 6915 + }, + { + "epoch": 1.2843082636954504, + "grad_norm": 1.5601381063461304, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8701235055923462, + "num_tokens": 252013971.0, + "step": 6916 + }, + { + "epoch": 1.2844939647168059, + "grad_norm": 1.4757474660873413, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8879182934761047, + "num_tokens": 252051599.0, + "step": 6917 + }, + { + "epoch": 1.2846796657381616, + "grad_norm": 1.5490838289260864, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8812615275382996, + "num_tokens": 252087620.0, + "step": 6918 + }, + { + "epoch": 1.2848653667595171, + "grad_norm": 1.5203096866607666, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8720455169677734, + "num_tokens": 252127310.0, + "step": 6919 + }, + { + "epoch": 1.2850510677808729, + "grad_norm": 1.5847442150115967, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8857631683349609, + "num_tokens": 252160509.0, + "step": 6920 + }, + { + "epoch": 1.2852367688022284, + "grad_norm": 1.5354841947555542, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8683158159255981, + "num_tokens": 252201513.0, + "step": 6921 + }, + { + "epoch": 1.285422469823584, + "grad_norm": 1.6568398475646973, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.86519855260849, + "num_tokens": 252237886.0, + "step": 6922 + }, + { + "epoch": 1.2856081708449396, + "grad_norm": 1.7157093286514282, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8807985782623291, + "num_tokens": 252269913.0, + "step": 6923 + }, + { + "epoch": 1.2857938718662953, + "grad_norm": 1.5676988363265991, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8638893961906433, + "num_tokens": 252306793.0, + "step": 6924 + }, + { + "epoch": 1.2859795728876509, + "grad_norm": 1.6009975671768188, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8759158253669739, + "num_tokens": 252343213.0, + "step": 6925 + }, + { + "epoch": 1.2861652739090066, + "grad_norm": 1.6533193588256836, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8767807483673096, + "num_tokens": 252378523.0, + "step": 6926 + }, + { + "epoch": 1.286350974930362, + "grad_norm": 1.6502950191497803, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8619691133499146, + "num_tokens": 252414321.0, + "step": 6927 + }, + { + "epoch": 1.2865366759517176, + "grad_norm": 1.5996456146240234, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8818064332008362, + "num_tokens": 252453789.0, + "step": 6928 + }, + { + "epoch": 1.2867223769730733, + "grad_norm": 1.5966733694076538, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.865653395652771, + "num_tokens": 252492466.0, + "step": 6929 + }, + { + "epoch": 1.286908077994429, + "grad_norm": 1.467471957206726, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8685766458511353, + "num_tokens": 252532180.0, + "step": 6930 + }, + { + "epoch": 1.2870937790157846, + "grad_norm": 1.5092226266860962, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8724299669265747, + "num_tokens": 252569389.0, + "step": 6931 + }, + { + "epoch": 1.28727948003714, + "grad_norm": 1.672257661819458, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8809951543807983, + "num_tokens": 252598534.0, + "step": 6932 + }, + { + "epoch": 1.2874651810584958, + "grad_norm": 1.4888700246810913, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8853771686553955, + "num_tokens": 252637910.0, + "step": 6933 + }, + { + "epoch": 1.2876508820798516, + "grad_norm": 1.6754847764968872, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8707146644592285, + "num_tokens": 252673256.0, + "step": 6934 + }, + { + "epoch": 1.287836583101207, + "grad_norm": 1.5494624376296997, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8820744752883911, + "num_tokens": 252713599.0, + "step": 6935 + }, + { + "epoch": 1.2880222841225626, + "grad_norm": 1.6943490505218506, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.871328592300415, + "num_tokens": 252746814.0, + "step": 6936 + }, + { + "epoch": 1.2882079851439183, + "grad_norm": 1.5194860696792603, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.874774694442749, + "num_tokens": 252783316.0, + "step": 6937 + }, + { + "epoch": 1.2883936861652738, + "grad_norm": 1.6666775941848755, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8750683665275574, + "num_tokens": 252817529.0, + "step": 6938 + }, + { + "epoch": 1.2885793871866296, + "grad_norm": 1.423279047012329, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8964893817901611, + "num_tokens": 252854131.0, + "step": 6939 + }, + { + "epoch": 1.288765088207985, + "grad_norm": 1.626947283744812, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8753639459609985, + "num_tokens": 252892451.0, + "step": 6940 + }, + { + "epoch": 1.2889507892293408, + "grad_norm": 1.603650689125061, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8717451691627502, + "num_tokens": 252927491.0, + "step": 6941 + }, + { + "epoch": 1.2891364902506963, + "grad_norm": 1.6180158853530884, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8755533695220947, + "num_tokens": 252961551.0, + "step": 6942 + }, + { + "epoch": 1.289322191272052, + "grad_norm": 1.376415491104126, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8901492357254028, + "num_tokens": 253002431.0, + "step": 6943 + }, + { + "epoch": 1.2895078922934076, + "grad_norm": 1.4357818365097046, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.877701997756958, + "num_tokens": 253043343.0, + "step": 6944 + }, + { + "epoch": 1.2896935933147633, + "grad_norm": 1.678974986076355, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8713616132736206, + "num_tokens": 253075522.0, + "step": 6945 + }, + { + "epoch": 1.2898792943361188, + "grad_norm": 1.5385463237762451, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.866213321685791, + "num_tokens": 253122464.0, + "step": 6946 + }, + { + "epoch": 1.2900649953574745, + "grad_norm": 1.462026834487915, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8774142265319824, + "num_tokens": 253162270.0, + "step": 6947 + }, + { + "epoch": 1.29025069637883, + "grad_norm": 1.629630208015442, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8598195314407349, + "num_tokens": 253198408.0, + "step": 6948 + }, + { + "epoch": 1.2904363974001858, + "grad_norm": 1.5007548332214355, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8770397901535034, + "num_tokens": 253237114.0, + "step": 6949 + }, + { + "epoch": 1.2906220984215413, + "grad_norm": 1.633614182472229, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8802106380462646, + "num_tokens": 253271617.0, + "step": 6950 + }, + { + "epoch": 1.2908077994428968, + "grad_norm": 1.5777009725570679, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8818994760513306, + "num_tokens": 253304161.0, + "step": 6951 + }, + { + "epoch": 1.2909935004642525, + "grad_norm": 1.607980728149414, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8715204000473022, + "num_tokens": 253342864.0, + "step": 6952 + }, + { + "epoch": 1.2911792014856083, + "grad_norm": 1.4914954900741577, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8768841028213501, + "num_tokens": 253381054.0, + "step": 6953 + }, + { + "epoch": 1.2913649025069638, + "grad_norm": 1.4618425369262695, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8792985081672668, + "num_tokens": 253417760.0, + "step": 6954 + }, + { + "epoch": 1.2915506035283193, + "grad_norm": 1.7206143140792847, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8859119415283203, + "num_tokens": 253446640.0, + "step": 6955 + }, + { + "epoch": 1.291736304549675, + "grad_norm": 1.5958811044692993, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8644787669181824, + "num_tokens": 253484308.0, + "step": 6956 + }, + { + "epoch": 1.2919220055710308, + "grad_norm": 1.7044727802276611, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8759562969207764, + "num_tokens": 253515412.0, + "step": 6957 + }, + { + "epoch": 1.2921077065923863, + "grad_norm": 1.5032016038894653, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8711745738983154, + "num_tokens": 253556963.0, + "step": 6958 + }, + { + "epoch": 1.2922934076137418, + "grad_norm": 1.618327021598816, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8617627620697021, + "num_tokens": 253594811.0, + "step": 6959 + }, + { + "epoch": 1.2924791086350975, + "grad_norm": 1.5551804304122925, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8754909634590149, + "num_tokens": 253633574.0, + "step": 6960 + }, + { + "epoch": 1.292664809656453, + "grad_norm": 1.6596684455871582, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8634977340698242, + "num_tokens": 253670367.0, + "step": 6961 + }, + { + "epoch": 1.2928505106778088, + "grad_norm": 1.560849666595459, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8816611766815186, + "num_tokens": 253703976.0, + "step": 6962 + }, + { + "epoch": 1.2930362116991643, + "grad_norm": 1.4055571556091309, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8864651322364807, + "num_tokens": 253746465.0, + "step": 6963 + }, + { + "epoch": 1.29322191272052, + "grad_norm": 1.453118085861206, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8747396469116211, + "num_tokens": 253790364.0, + "step": 6964 + }, + { + "epoch": 1.2934076137418755, + "grad_norm": 1.5279326438903809, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8762043714523315, + "num_tokens": 253832186.0, + "step": 6965 + }, + { + "epoch": 1.2935933147632313, + "grad_norm": 1.6365420818328857, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8687911033630371, + "num_tokens": 253866430.0, + "step": 6966 + }, + { + "epoch": 1.2937790157845868, + "grad_norm": 1.5879257917404175, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8607516288757324, + "num_tokens": 253902713.0, + "step": 6967 + }, + { + "epoch": 1.2939647168059425, + "grad_norm": 1.5259990692138672, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8764050006866455, + "num_tokens": 253940545.0, + "step": 6968 + }, + { + "epoch": 1.294150417827298, + "grad_norm": 1.662329912185669, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8680828213691711, + "num_tokens": 253972110.0, + "step": 6969 + }, + { + "epoch": 1.2943361188486537, + "grad_norm": 1.4090113639831543, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8682938814163208, + "num_tokens": 254018718.0, + "step": 6970 + }, + { + "epoch": 1.2945218198700092, + "grad_norm": 1.664138913154602, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8743914365768433, + "num_tokens": 254050273.0, + "step": 6971 + }, + { + "epoch": 1.294707520891365, + "grad_norm": 1.5430781841278076, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.881576418876648, + "num_tokens": 254084581.0, + "step": 6972 + }, + { + "epoch": 1.2948932219127205, + "grad_norm": 1.6174001693725586, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8590050935745239, + "num_tokens": 254119499.0, + "step": 6973 + }, + { + "epoch": 1.295078922934076, + "grad_norm": 1.5814790725708008, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8647947311401367, + "num_tokens": 254155683.0, + "step": 6974 + }, + { + "epoch": 1.2952646239554317, + "grad_norm": 1.4918326139450073, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8831555843353271, + "num_tokens": 254194848.0, + "step": 6975 + }, + { + "epoch": 1.2954503249767875, + "grad_norm": 1.5340219736099243, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8796532154083252, + "num_tokens": 254235107.0, + "step": 6976 + }, + { + "epoch": 1.295636025998143, + "grad_norm": 1.50364089012146, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.854933500289917, + "num_tokens": 254273927.0, + "step": 6977 + }, + { + "epoch": 1.2958217270194985, + "grad_norm": 1.4430711269378662, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8788192272186279, + "num_tokens": 254317404.0, + "step": 6978 + }, + { + "epoch": 1.2960074280408542, + "grad_norm": 1.5202919244766235, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8744840025901794, + "num_tokens": 254353821.0, + "step": 6979 + }, + { + "epoch": 1.29619312906221, + "grad_norm": 1.5637272596359253, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8778252601623535, + "num_tokens": 254385877.0, + "step": 6980 + }, + { + "epoch": 1.2963788300835655, + "grad_norm": 1.466344952583313, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8719762563705444, + "num_tokens": 254425302.0, + "step": 6981 + }, + { + "epoch": 1.296564531104921, + "grad_norm": 1.697243332862854, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8737122416496277, + "num_tokens": 254457710.0, + "step": 6982 + }, + { + "epoch": 1.2967502321262767, + "grad_norm": 1.671197533607483, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8848506212234497, + "num_tokens": 254491027.0, + "step": 6983 + }, + { + "epoch": 1.2969359331476324, + "grad_norm": 1.419124722480774, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8959169983863831, + "num_tokens": 254529333.0, + "step": 6984 + }, + { + "epoch": 1.297121634168988, + "grad_norm": 1.5803234577178955, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8670912981033325, + "num_tokens": 254565775.0, + "step": 6985 + }, + { + "epoch": 1.2973073351903435, + "grad_norm": 1.5178481340408325, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8911411762237549, + "num_tokens": 254600343.0, + "step": 6986 + }, + { + "epoch": 1.2974930362116992, + "grad_norm": 1.7050756216049194, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8899680376052856, + "num_tokens": 254629923.0, + "step": 6987 + }, + { + "epoch": 1.2976787372330547, + "grad_norm": 1.5328364372253418, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8524214029312134, + "num_tokens": 254669884.0, + "step": 6988 + }, + { + "epoch": 1.2978644382544104, + "grad_norm": 1.4811471700668335, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8737186193466187, + "num_tokens": 254711101.0, + "step": 6989 + }, + { + "epoch": 1.298050139275766, + "grad_norm": 1.496616005897522, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8713414669036865, + "num_tokens": 254749402.0, + "step": 6990 + }, + { + "epoch": 1.2982358402971217, + "grad_norm": 1.451135516166687, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8729203939437866, + "num_tokens": 254792054.0, + "step": 6991 + }, + { + "epoch": 1.2984215413184772, + "grad_norm": 1.6359045505523682, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8602923154830933, + "num_tokens": 254830606.0, + "step": 6992 + }, + { + "epoch": 1.298607242339833, + "grad_norm": 1.4054208993911743, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8939598202705383, + "num_tokens": 254867939.0, + "step": 6993 + }, + { + "epoch": 1.2987929433611884, + "grad_norm": 1.5521354675292969, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8842347860336304, + "num_tokens": 254901590.0, + "step": 6994 + }, + { + "epoch": 1.2989786443825442, + "grad_norm": 1.5787808895111084, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.868084728717804, + "num_tokens": 254935846.0, + "step": 6995 + }, + { + "epoch": 1.2991643454038997, + "grad_norm": 1.6566661596298218, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8707075715065002, + "num_tokens": 254966862.0, + "step": 6996 + }, + { + "epoch": 1.2993500464252552, + "grad_norm": 1.4906387329101562, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8822134733200073, + "num_tokens": 255005825.0, + "step": 6997 + }, + { + "epoch": 1.299535747446611, + "grad_norm": 1.4033557176589966, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8794751763343811, + "num_tokens": 255048279.0, + "step": 6998 + }, + { + "epoch": 1.2997214484679667, + "grad_norm": 1.4209917783737183, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8848276138305664, + "num_tokens": 255088698.0, + "step": 6999 + }, + { + "epoch": 1.2999071494893222, + "grad_norm": 1.6915993690490723, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8705718517303467, + "num_tokens": 255123863.0, + "step": 7000 + }, + { + "epoch": 1.3000928505106777, + "grad_norm": 1.5659677982330322, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8853738307952881, + "num_tokens": 255159100.0, + "step": 7001 + }, + { + "epoch": 1.3002785515320334, + "grad_norm": 1.6035135984420776, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8863502144813538, + "num_tokens": 255191177.0, + "step": 7002 + }, + { + "epoch": 1.3004642525533892, + "grad_norm": 1.6123048067092896, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8939613699913025, + "num_tokens": 255223403.0, + "step": 7003 + }, + { + "epoch": 1.3006499535747447, + "grad_norm": 1.5664598941802979, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8750325441360474, + "num_tokens": 255257295.0, + "step": 7004 + }, + { + "epoch": 1.3008356545961002, + "grad_norm": 1.7230892181396484, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8902866244316101, + "num_tokens": 255290951.0, + "step": 7005 + }, + { + "epoch": 1.301021355617456, + "grad_norm": 1.7887203693389893, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8848646879196167, + "num_tokens": 255320363.0, + "step": 7006 + }, + { + "epoch": 1.3012070566388116, + "grad_norm": 1.4487848281860352, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8844049572944641, + "num_tokens": 255359732.0, + "step": 7007 + }, + { + "epoch": 1.3013927576601672, + "grad_norm": 1.6451611518859863, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8741541504859924, + "num_tokens": 255394349.0, + "step": 7008 + }, + { + "epoch": 1.3015784586815227, + "grad_norm": 1.5161679983139038, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8709613084793091, + "num_tokens": 255433843.0, + "step": 7009 + }, + { + "epoch": 1.3017641597028784, + "grad_norm": 1.6343843936920166, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8722749352455139, + "num_tokens": 255467600.0, + "step": 7010 + }, + { + "epoch": 1.301949860724234, + "grad_norm": 1.4047375917434692, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8852266073226929, + "num_tokens": 255507591.0, + "step": 7011 + }, + { + "epoch": 1.3021355617455896, + "grad_norm": 1.4612410068511963, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8774356842041016, + "num_tokens": 255545434.0, + "step": 7012 + }, + { + "epoch": 1.3023212627669452, + "grad_norm": 1.5403300523757935, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8858647346496582, + "num_tokens": 255580377.0, + "step": 7013 + }, + { + "epoch": 1.3025069637883009, + "grad_norm": 1.4141148328781128, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8886234760284424, + "num_tokens": 255624472.0, + "step": 7014 + }, + { + "epoch": 1.3026926648096564, + "grad_norm": 1.515898585319519, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8806294202804565, + "num_tokens": 255663312.0, + "step": 7015 + }, + { + "epoch": 1.3028783658310121, + "grad_norm": 1.4508689641952515, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8785988092422485, + "num_tokens": 255701746.0, + "step": 7016 + }, + { + "epoch": 1.3030640668523676, + "grad_norm": 1.6876016855239868, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8798806667327881, + "num_tokens": 255734253.0, + "step": 7017 + }, + { + "epoch": 1.3032497678737234, + "grad_norm": 1.7407816648483276, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.867089569568634, + "num_tokens": 255767072.0, + "step": 7018 + }, + { + "epoch": 1.3034354688950789, + "grad_norm": 1.5523673295974731, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8840303421020508, + "num_tokens": 255800075.0, + "step": 7019 + }, + { + "epoch": 1.3036211699164346, + "grad_norm": 1.5368762016296387, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8792790174484253, + "num_tokens": 255834075.0, + "step": 7020 + }, + { + "epoch": 1.3038068709377901, + "grad_norm": 1.4315159320831299, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8747364282608032, + "num_tokens": 255873566.0, + "step": 7021 + }, + { + "epoch": 1.3039925719591459, + "grad_norm": 1.505990743637085, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8902077674865723, + "num_tokens": 255908854.0, + "step": 7022 + }, + { + "epoch": 1.3041782729805014, + "grad_norm": 1.566343069076538, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8658027052879333, + "num_tokens": 255949189.0, + "step": 7023 + }, + { + "epoch": 1.3043639740018569, + "grad_norm": 1.5922110080718994, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8763263821601868, + "num_tokens": 255985593.0, + "step": 7024 + }, + { + "epoch": 1.3045496750232126, + "grad_norm": 1.6591027975082397, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8677449822425842, + "num_tokens": 256020217.0, + "step": 7025 + }, + { + "epoch": 1.3047353760445684, + "grad_norm": 1.4679388999938965, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8915508985519409, + "num_tokens": 256053766.0, + "step": 7026 + }, + { + "epoch": 1.3049210770659239, + "grad_norm": 1.510960340499878, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8844559788703918, + "num_tokens": 256088900.0, + "step": 7027 + }, + { + "epoch": 1.3051067780872794, + "grad_norm": 1.5173248052597046, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8781449794769287, + "num_tokens": 256122767.0, + "step": 7028 + }, + { + "epoch": 1.305292479108635, + "grad_norm": 1.5546233654022217, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8769335746765137, + "num_tokens": 256160314.0, + "step": 7029 + }, + { + "epoch": 1.3054781801299908, + "grad_norm": 1.3987233638763428, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8788187503814697, + "num_tokens": 256201311.0, + "step": 7030 + }, + { + "epoch": 1.3056638811513464, + "grad_norm": 1.4818882942199707, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.883858859539032, + "num_tokens": 256238714.0, + "step": 7031 + }, + { + "epoch": 1.3058495821727019, + "grad_norm": 1.3905678987503052, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8814120888710022, + "num_tokens": 256280049.0, + "step": 7032 + }, + { + "epoch": 1.3060352831940576, + "grad_norm": 1.5337482690811157, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8730872273445129, + "num_tokens": 256315949.0, + "step": 7033 + }, + { + "epoch": 1.306220984215413, + "grad_norm": 1.3467079401016235, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8832160234451294, + "num_tokens": 256358996.0, + "step": 7034 + }, + { + "epoch": 1.3064066852367688, + "grad_norm": 1.6929293870925903, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8650485277175903, + "num_tokens": 256393603.0, + "step": 7035 + }, + { + "epoch": 1.3065923862581243, + "grad_norm": 1.4109888076782227, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8838393688201904, + "num_tokens": 256433355.0, + "step": 7036 + }, + { + "epoch": 1.30677808727948, + "grad_norm": 1.5335407257080078, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8916250467300415, + "num_tokens": 256469878.0, + "step": 7037 + }, + { + "epoch": 1.3069637883008356, + "grad_norm": 1.3755452632904053, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8724254369735718, + "num_tokens": 256515451.0, + "step": 7038 + }, + { + "epoch": 1.3071494893221913, + "grad_norm": 1.5391427278518677, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8751721978187561, + "num_tokens": 256549514.0, + "step": 7039 + }, + { + "epoch": 1.3073351903435468, + "grad_norm": 1.5408306121826172, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8751716613769531, + "num_tokens": 256586509.0, + "step": 7040 + }, + { + "epoch": 1.3075208913649026, + "grad_norm": 1.4004148244857788, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8861629962921143, + "num_tokens": 256626932.0, + "step": 7041 + }, + { + "epoch": 1.307706592386258, + "grad_norm": 1.5971330404281616, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.868338406085968, + "num_tokens": 256663112.0, + "step": 7042 + }, + { + "epoch": 1.3078922934076138, + "grad_norm": 1.5172854661941528, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8738969564437866, + "num_tokens": 256702327.0, + "step": 7043 + }, + { + "epoch": 1.3080779944289693, + "grad_norm": 1.5453619956970215, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8681384921073914, + "num_tokens": 256743066.0, + "step": 7044 + }, + { + "epoch": 1.308263695450325, + "grad_norm": 1.4995001554489136, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8981984853744507, + "num_tokens": 256776878.0, + "step": 7045 + }, + { + "epoch": 1.3084493964716806, + "grad_norm": 1.4616984128952026, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8712223172187805, + "num_tokens": 256815829.0, + "step": 7046 + }, + { + "epoch": 1.308635097493036, + "grad_norm": 1.4601600170135498, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8846145868301392, + "num_tokens": 256854309.0, + "step": 7047 + }, + { + "epoch": 1.3088207985143918, + "grad_norm": 1.5671143531799316, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8731272220611572, + "num_tokens": 256890128.0, + "step": 7048 + }, + { + "epoch": 1.3090064995357475, + "grad_norm": 1.5860897302627563, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8817452192306519, + "num_tokens": 256924922.0, + "step": 7049 + }, + { + "epoch": 1.309192200557103, + "grad_norm": 1.5310137271881104, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8703285455703735, + "num_tokens": 256964565.0, + "step": 7050 + }, + { + "epoch": 1.3093779015784586, + "grad_norm": 1.618786334991455, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8822406530380249, + "num_tokens": 256998162.0, + "step": 7051 + }, + { + "epoch": 1.3095636025998143, + "grad_norm": 1.5633225440979004, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8746494650840759, + "num_tokens": 257034022.0, + "step": 7052 + }, + { + "epoch": 1.30974930362117, + "grad_norm": 1.7704195976257324, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8766372203826904, + "num_tokens": 257065794.0, + "step": 7053 + }, + { + "epoch": 1.3099350046425255, + "grad_norm": 1.5354982614517212, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8821671009063721, + "num_tokens": 257102689.0, + "step": 7054 + }, + { + "epoch": 1.310120705663881, + "grad_norm": 1.5083367824554443, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8784116506576538, + "num_tokens": 257139256.0, + "step": 7055 + }, + { + "epoch": 1.3103064066852368, + "grad_norm": 1.5938977003097534, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8663809895515442, + "num_tokens": 257178130.0, + "step": 7056 + }, + { + "epoch": 1.3104921077065925, + "grad_norm": 1.6922646760940552, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8795427083969116, + "num_tokens": 257210635.0, + "step": 7057 + }, + { + "epoch": 1.310677808727948, + "grad_norm": 1.6195735931396484, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8692219853401184, + "num_tokens": 257249330.0, + "step": 7058 + }, + { + "epoch": 1.3108635097493035, + "grad_norm": 1.5216983556747437, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8812183737754822, + "num_tokens": 257284384.0, + "step": 7059 + }, + { + "epoch": 1.3110492107706593, + "grad_norm": 1.6206324100494385, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8816810846328735, + "num_tokens": 257317289.0, + "step": 7060 + }, + { + "epoch": 1.3112349117920148, + "grad_norm": 1.510534644126892, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8803489804267883, + "num_tokens": 257356523.0, + "step": 7061 + }, + { + "epoch": 1.3114206128133705, + "grad_norm": 1.619518756866455, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8612735271453857, + "num_tokens": 257390937.0, + "step": 7062 + }, + { + "epoch": 1.311606313834726, + "grad_norm": 1.4573537111282349, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8845750093460083, + "num_tokens": 257431014.0, + "step": 7063 + }, + { + "epoch": 1.3117920148560818, + "grad_norm": 1.5089002847671509, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.87715083360672, + "num_tokens": 257467094.0, + "step": 7064 + }, + { + "epoch": 1.3119777158774373, + "grad_norm": 1.4370839595794678, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8836493492126465, + "num_tokens": 257504776.0, + "step": 7065 + }, + { + "epoch": 1.312163416898793, + "grad_norm": 1.495712399482727, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8687452077865601, + "num_tokens": 257550120.0, + "step": 7066 + }, + { + "epoch": 1.3123491179201485, + "grad_norm": 1.7041457891464233, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8643584847450256, + "num_tokens": 257586604.0, + "step": 7067 + }, + { + "epoch": 1.3125348189415043, + "grad_norm": 1.6622748374938965, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8716213703155518, + "num_tokens": 257620071.0, + "step": 7068 + }, + { + "epoch": 1.3127205199628598, + "grad_norm": 1.65346360206604, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8841776847839355, + "num_tokens": 257656193.0, + "step": 7069 + }, + { + "epoch": 1.3129062209842153, + "grad_norm": 1.4676954746246338, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8728885054588318, + "num_tokens": 257694804.0, + "step": 7070 + }, + { + "epoch": 1.313091922005571, + "grad_norm": 1.623148798942566, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8766790628433228, + "num_tokens": 257731716.0, + "step": 7071 + }, + { + "epoch": 1.3132776230269267, + "grad_norm": 1.5649850368499756, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8813197612762451, + "num_tokens": 257765394.0, + "step": 7072 + }, + { + "epoch": 1.3134633240482823, + "grad_norm": 1.741467833518982, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8685823678970337, + "num_tokens": 257798066.0, + "step": 7073 + }, + { + "epoch": 1.3136490250696378, + "grad_norm": 1.4827696084976196, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8816649913787842, + "num_tokens": 257835115.0, + "step": 7074 + }, + { + "epoch": 1.3138347260909935, + "grad_norm": 1.6759697198867798, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8549003601074219, + "num_tokens": 257868579.0, + "step": 7075 + }, + { + "epoch": 1.3140204271123492, + "grad_norm": 1.4398540258407593, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8764974474906921, + "num_tokens": 257909014.0, + "step": 7076 + }, + { + "epoch": 1.3142061281337047, + "grad_norm": 1.4949121475219727, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8696670532226562, + "num_tokens": 257949101.0, + "step": 7077 + }, + { + "epoch": 1.3143918291550603, + "grad_norm": 1.5403740406036377, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8769912719726562, + "num_tokens": 257987593.0, + "step": 7078 + }, + { + "epoch": 1.314577530176416, + "grad_norm": 1.5275510549545288, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8720158338546753, + "num_tokens": 258029440.0, + "step": 7079 + }, + { + "epoch": 1.3147632311977717, + "grad_norm": 1.7017159461975098, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8649442195892334, + "num_tokens": 258063331.0, + "step": 7080 + }, + { + "epoch": 1.3149489322191272, + "grad_norm": 1.6453542709350586, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8602327108383179, + "num_tokens": 258098174.0, + "step": 7081 + }, + { + "epoch": 1.3151346332404827, + "grad_norm": 1.4067933559417725, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8788043856620789, + "num_tokens": 258140548.0, + "step": 7082 + }, + { + "epoch": 1.3153203342618385, + "grad_norm": 1.4682866334915161, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8702354431152344, + "num_tokens": 258180945.0, + "step": 7083 + }, + { + "epoch": 1.315506035283194, + "grad_norm": 1.6381418704986572, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8618364334106445, + "num_tokens": 258212644.0, + "step": 7084 + }, + { + "epoch": 1.3156917363045497, + "grad_norm": 1.4466067552566528, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8851054906845093, + "num_tokens": 258249949.0, + "step": 7085 + }, + { + "epoch": 1.3158774373259052, + "grad_norm": 1.649274468421936, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8811320066452026, + "num_tokens": 258280661.0, + "step": 7086 + }, + { + "epoch": 1.316063138347261, + "grad_norm": 1.5993711948394775, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8778058290481567, + "num_tokens": 258315262.0, + "step": 7087 + }, + { + "epoch": 1.3162488393686165, + "grad_norm": 1.6741694211959839, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8622879981994629, + "num_tokens": 258354786.0, + "step": 7088 + }, + { + "epoch": 1.3164345403899722, + "grad_norm": 1.6408321857452393, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8804254531860352, + "num_tokens": 258390026.0, + "step": 7089 + }, + { + "epoch": 1.3166202414113277, + "grad_norm": 1.629643440246582, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8701640963554382, + "num_tokens": 258423014.0, + "step": 7090 + }, + { + "epoch": 1.3168059424326835, + "grad_norm": 1.6134963035583496, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8858277797698975, + "num_tokens": 258456469.0, + "step": 7091 + }, + { + "epoch": 1.316991643454039, + "grad_norm": 1.5588268041610718, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.883148729801178, + "num_tokens": 258492538.0, + "step": 7092 + }, + { + "epoch": 1.3171773444753947, + "grad_norm": 1.6498855352401733, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8730291128158569, + "num_tokens": 258529516.0, + "step": 7093 + }, + { + "epoch": 1.3173630454967502, + "grad_norm": 1.5851879119873047, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8766605257987976, + "num_tokens": 258562343.0, + "step": 7094 + }, + { + "epoch": 1.317548746518106, + "grad_norm": 1.5077040195465088, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8858530521392822, + "num_tokens": 258598981.0, + "step": 7095 + }, + { + "epoch": 1.3177344475394615, + "grad_norm": 1.5072928667068481, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8784513473510742, + "num_tokens": 258639895.0, + "step": 7096 + }, + { + "epoch": 1.317920148560817, + "grad_norm": 1.6113529205322266, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8838458061218262, + "num_tokens": 258670986.0, + "step": 7097 + }, + { + "epoch": 1.3181058495821727, + "grad_norm": 1.4632058143615723, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8924046754837036, + "num_tokens": 258707444.0, + "step": 7098 + }, + { + "epoch": 1.3182915506035284, + "grad_norm": 1.4822471141815186, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8720079660415649, + "num_tokens": 258746620.0, + "step": 7099 + }, + { + "epoch": 1.318477251624884, + "grad_norm": 1.5526150465011597, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8756651878356934, + "num_tokens": 258782140.0, + "step": 7100 + }, + { + "epoch": 1.3186629526462395, + "grad_norm": 1.814315915107727, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8719542026519775, + "num_tokens": 258815865.0, + "step": 7101 + }, + { + "epoch": 1.3188486536675952, + "grad_norm": 1.691486120223999, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8768109679222107, + "num_tokens": 258851593.0, + "step": 7102 + }, + { + "epoch": 1.319034354688951, + "grad_norm": 1.5424190759658813, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.882123589515686, + "num_tokens": 258889719.0, + "step": 7103 + }, + { + "epoch": 1.3192200557103064, + "grad_norm": 1.5406190156936646, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8987311124801636, + "num_tokens": 258924137.0, + "step": 7104 + }, + { + "epoch": 1.319405756731662, + "grad_norm": 1.590250849723816, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8612793684005737, + "num_tokens": 258962670.0, + "step": 7105 + }, + { + "epoch": 1.3195914577530177, + "grad_norm": 1.5179646015167236, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8680956959724426, + "num_tokens": 259004585.0, + "step": 7106 + }, + { + "epoch": 1.3197771587743732, + "grad_norm": 1.6281359195709229, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.869274914264679, + "num_tokens": 259041162.0, + "step": 7107 + }, + { + "epoch": 1.319962859795729, + "grad_norm": 1.5278881788253784, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8728564381599426, + "num_tokens": 259081883.0, + "step": 7108 + }, + { + "epoch": 1.3201485608170844, + "grad_norm": 1.6324623823165894, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8768556714057922, + "num_tokens": 259114896.0, + "step": 7109 + }, + { + "epoch": 1.3203342618384402, + "grad_norm": 1.5382238626480103, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8688138127326965, + "num_tokens": 259153987.0, + "step": 7110 + }, + { + "epoch": 1.3205199628597957, + "grad_norm": 1.3661091327667236, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8785572648048401, + "num_tokens": 259195495.0, + "step": 7111 + }, + { + "epoch": 1.3207056638811514, + "grad_norm": 1.836754322052002, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8782339096069336, + "num_tokens": 259223942.0, + "step": 7112 + }, + { + "epoch": 1.320891364902507, + "grad_norm": 1.5491374731063843, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.880203366279602, + "num_tokens": 259261882.0, + "step": 7113 + }, + { + "epoch": 1.3210770659238626, + "grad_norm": 1.4670051336288452, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8864090442657471, + "num_tokens": 259299637.0, + "step": 7114 + }, + { + "epoch": 1.3212627669452182, + "grad_norm": 1.6075364351272583, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8641672134399414, + "num_tokens": 259335867.0, + "step": 7115 + }, + { + "epoch": 1.321448467966574, + "grad_norm": 1.5819697380065918, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8873598575592041, + "num_tokens": 259370993.0, + "step": 7116 + }, + { + "epoch": 1.3216341689879294, + "grad_norm": 1.4112752676010132, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8889704942703247, + "num_tokens": 259407104.0, + "step": 7117 + }, + { + "epoch": 1.3218198700092851, + "grad_norm": 1.498522400856018, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8814526796340942, + "num_tokens": 259443685.0, + "step": 7118 + }, + { + "epoch": 1.3220055710306406, + "grad_norm": 1.5667304992675781, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8806275725364685, + "num_tokens": 259482261.0, + "step": 7119 + }, + { + "epoch": 1.3221912720519962, + "grad_norm": 1.439611792564392, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.882683515548706, + "num_tokens": 259524671.0, + "step": 7120 + }, + { + "epoch": 1.322376973073352, + "grad_norm": 1.4437215328216553, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8801144957542419, + "num_tokens": 259564652.0, + "step": 7121 + }, + { + "epoch": 1.3225626740947076, + "grad_norm": 1.586881160736084, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8759405612945557, + "num_tokens": 259601593.0, + "step": 7122 + }, + { + "epoch": 1.3227483751160631, + "grad_norm": 2.6979610919952393, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8903967142105103, + "num_tokens": 259640585.0, + "step": 7123 + }, + { + "epoch": 1.3229340761374186, + "grad_norm": 1.6002566814422607, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8665962815284729, + "num_tokens": 259675548.0, + "step": 7124 + }, + { + "epoch": 1.3231197771587744, + "grad_norm": 1.7224361896514893, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8743777275085449, + "num_tokens": 259708780.0, + "step": 7125 + }, + { + "epoch": 1.3233054781801301, + "grad_norm": 1.40106201171875, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8919422030448914, + "num_tokens": 259747728.0, + "step": 7126 + }, + { + "epoch": 1.3234911792014856, + "grad_norm": 1.4780174493789673, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8882609009742737, + "num_tokens": 259786004.0, + "step": 7127 + }, + { + "epoch": 1.3236768802228411, + "grad_norm": 1.5700453519821167, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8800315856933594, + "num_tokens": 259819717.0, + "step": 7128 + }, + { + "epoch": 1.3238625812441969, + "grad_norm": 1.5781874656677246, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8721301555633545, + "num_tokens": 259857731.0, + "step": 7129 + }, + { + "epoch": 1.3240482822655524, + "grad_norm": 1.510308027267456, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8902712464332581, + "num_tokens": 259890995.0, + "step": 7130 + }, + { + "epoch": 1.3242339832869081, + "grad_norm": 1.5816675424575806, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8768086433410645, + "num_tokens": 259923674.0, + "step": 7131 + }, + { + "epoch": 1.3244196843082636, + "grad_norm": 1.5327833890914917, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8791801929473877, + "num_tokens": 259960953.0, + "step": 7132 + }, + { + "epoch": 1.3246053853296194, + "grad_norm": 1.4763526916503906, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8773201704025269, + "num_tokens": 260003512.0, + "step": 7133 + }, + { + "epoch": 1.3247910863509749, + "grad_norm": 1.50010347366333, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8754445314407349, + "num_tokens": 260045176.0, + "step": 7134 + }, + { + "epoch": 1.3249767873723306, + "grad_norm": 1.5725771188735962, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8862031698226929, + "num_tokens": 260077176.0, + "step": 7135 + }, + { + "epoch": 1.325162488393686, + "grad_norm": 1.6355139017105103, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8806008100509644, + "num_tokens": 260111557.0, + "step": 7136 + }, + { + "epoch": 1.3253481894150418, + "grad_norm": 1.5027081966400146, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.878934383392334, + "num_tokens": 260148783.0, + "step": 7137 + }, + { + "epoch": 1.3255338904363974, + "grad_norm": 1.3811290264129639, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8911956548690796, + "num_tokens": 260189260.0, + "step": 7138 + }, + { + "epoch": 1.325719591457753, + "grad_norm": 1.7152366638183594, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8719820976257324, + "num_tokens": 260224682.0, + "step": 7139 + }, + { + "epoch": 1.3259052924791086, + "grad_norm": 1.6771342754364014, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8813284635543823, + "num_tokens": 260261074.0, + "step": 7140 + }, + { + "epoch": 1.3260909935004643, + "grad_norm": 1.5039355754852295, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.878372848033905, + "num_tokens": 260303567.0, + "step": 7141 + }, + { + "epoch": 1.3262766945218198, + "grad_norm": 1.5872105360031128, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8701697587966919, + "num_tokens": 260341508.0, + "step": 7142 + }, + { + "epoch": 1.3264623955431754, + "grad_norm": 1.5475021600723267, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8686051368713379, + "num_tokens": 260379399.0, + "step": 7143 + }, + { + "epoch": 1.326648096564531, + "grad_norm": 1.4899460077285767, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8931152820587158, + "num_tokens": 260414036.0, + "step": 7144 + }, + { + "epoch": 1.3268337975858868, + "grad_norm": 1.5078245401382446, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8782799243927002, + "num_tokens": 260450645.0, + "step": 7145 + }, + { + "epoch": 1.3270194986072423, + "grad_norm": 1.5704385042190552, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.862424910068512, + "num_tokens": 260487787.0, + "step": 7146 + }, + { + "epoch": 1.3272051996285978, + "grad_norm": 1.5876520872116089, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8733211755752563, + "num_tokens": 260528916.0, + "step": 7147 + }, + { + "epoch": 1.3273909006499536, + "grad_norm": 1.623962163925171, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8754848837852478, + "num_tokens": 260565845.0, + "step": 7148 + }, + { + "epoch": 1.3275766016713093, + "grad_norm": 1.6551371812820435, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8795299530029297, + "num_tokens": 260597542.0, + "step": 7149 + }, + { + "epoch": 1.3277623026926648, + "grad_norm": 1.6353358030319214, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8612594604492188, + "num_tokens": 260634274.0, + "step": 7150 + }, + { + "epoch": 1.3279480037140203, + "grad_norm": 1.6669806241989136, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8635244369506836, + "num_tokens": 260670774.0, + "step": 7151 + }, + { + "epoch": 1.328133704735376, + "grad_norm": 1.6324090957641602, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8772275447845459, + "num_tokens": 260706727.0, + "step": 7152 + }, + { + "epoch": 1.3283194057567318, + "grad_norm": 1.5938719511032104, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8832744359970093, + "num_tokens": 260742327.0, + "step": 7153 + }, + { + "epoch": 1.3285051067780873, + "grad_norm": 1.6111719608306885, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8654108047485352, + "num_tokens": 260779704.0, + "step": 7154 + }, + { + "epoch": 1.3286908077994428, + "grad_norm": 1.7371766567230225, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8692439794540405, + "num_tokens": 260813965.0, + "step": 7155 + }, + { + "epoch": 1.3288765088207986, + "grad_norm": 1.5206865072250366, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8848209381103516, + "num_tokens": 260849466.0, + "step": 7156 + }, + { + "epoch": 1.329062209842154, + "grad_norm": 1.429966926574707, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8749184012413025, + "num_tokens": 260893554.0, + "step": 7157 + }, + { + "epoch": 1.3292479108635098, + "grad_norm": 1.578478217124939, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8770653009414673, + "num_tokens": 260932606.0, + "step": 7158 + }, + { + "epoch": 1.3294336118848653, + "grad_norm": 1.589257001876831, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8769246339797974, + "num_tokens": 260967625.0, + "step": 7159 + }, + { + "epoch": 1.329619312906221, + "grad_norm": 1.6346075534820557, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8798158168792725, + "num_tokens": 261000899.0, + "step": 7160 + }, + { + "epoch": 1.3298050139275766, + "grad_norm": 1.5636465549468994, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8826010823249817, + "num_tokens": 261036344.0, + "step": 7161 + }, + { + "epoch": 1.3299907149489323, + "grad_norm": 1.6223326921463013, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8863512277603149, + "num_tokens": 261068805.0, + "step": 7162 + }, + { + "epoch": 1.3301764159702878, + "grad_norm": 1.5939322710037231, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8825821876525879, + "num_tokens": 261104781.0, + "step": 7163 + }, + { + "epoch": 1.3303621169916435, + "grad_norm": 1.4903980493545532, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8888009190559387, + "num_tokens": 261141621.0, + "step": 7164 + }, + { + "epoch": 1.330547818012999, + "grad_norm": 1.7222708463668823, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8777549862861633, + "num_tokens": 261175896.0, + "step": 7165 + }, + { + "epoch": 1.3307335190343546, + "grad_norm": 1.538474678993225, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.875716507434845, + "num_tokens": 261213851.0, + "step": 7166 + }, + { + "epoch": 1.3309192200557103, + "grad_norm": 1.5163121223449707, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8897190690040588, + "num_tokens": 261247985.0, + "step": 7167 + }, + { + "epoch": 1.331104921077066, + "grad_norm": 1.4876582622528076, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8719731569290161, + "num_tokens": 261287107.0, + "step": 7168 + }, + { + "epoch": 1.3312906220984215, + "grad_norm": 1.6915804147720337, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8664368391036987, + "num_tokens": 261322374.0, + "step": 7169 + }, + { + "epoch": 1.331476323119777, + "grad_norm": 1.4736539125442505, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8845983147621155, + "num_tokens": 261360606.0, + "step": 7170 + }, + { + "epoch": 1.3316620241411328, + "grad_norm": 1.6698873043060303, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8637502193450928, + "num_tokens": 261395693.0, + "step": 7171 + }, + { + "epoch": 1.3318477251624885, + "grad_norm": 1.4741803407669067, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8760310411453247, + "num_tokens": 261439332.0, + "step": 7172 + }, + { + "epoch": 1.332033426183844, + "grad_norm": 1.6270661354064941, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.886591374874115, + "num_tokens": 261471150.0, + "step": 7173 + }, + { + "epoch": 1.3322191272051995, + "grad_norm": 1.5398989915847778, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.891655445098877, + "num_tokens": 261508057.0, + "step": 7174 + }, + { + "epoch": 1.3324048282265553, + "grad_norm": 1.4972472190856934, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8844852447509766, + "num_tokens": 261545362.0, + "step": 7175 + }, + { + "epoch": 1.332590529247911, + "grad_norm": 1.4723423719406128, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8770947456359863, + "num_tokens": 261584783.0, + "step": 7176 + }, + { + "epoch": 1.3327762302692665, + "grad_norm": 1.5056108236312866, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8822447061538696, + "num_tokens": 261621512.0, + "step": 7177 + }, + { + "epoch": 1.332961931290622, + "grad_norm": 1.6684625148773193, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8819711208343506, + "num_tokens": 261652253.0, + "step": 7178 + }, + { + "epoch": 1.3331476323119777, + "grad_norm": 1.4803166389465332, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.875317394733429, + "num_tokens": 261690978.0, + "step": 7179 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.6755080223083496, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8771916627883911, + "num_tokens": 261724040.0, + "step": 7180 + }, + { + "epoch": 1.333519034354689, + "grad_norm": 1.5203088521957397, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8806188106536865, + "num_tokens": 261763276.0, + "step": 7181 + }, + { + "epoch": 1.3337047353760445, + "grad_norm": 1.648353099822998, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8765445351600647, + "num_tokens": 261799739.0, + "step": 7182 + }, + { + "epoch": 1.3338904363974002, + "grad_norm": 1.4886500835418701, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8857805132865906, + "num_tokens": 261837255.0, + "step": 7183 + }, + { + "epoch": 1.3340761374187557, + "grad_norm": 1.4672726392745972, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8827810883522034, + "num_tokens": 261879558.0, + "step": 7184 + }, + { + "epoch": 1.3342618384401115, + "grad_norm": 1.5363759994506836, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8819560408592224, + "num_tokens": 261913670.0, + "step": 7185 + }, + { + "epoch": 1.334447539461467, + "grad_norm": 1.6007260084152222, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8575542569160461, + "num_tokens": 261948521.0, + "step": 7186 + }, + { + "epoch": 1.3346332404828227, + "grad_norm": 1.7745678424835205, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8591850399971008, + "num_tokens": 261983318.0, + "step": 7187 + }, + { + "epoch": 1.3348189415041782, + "grad_norm": 1.4831466674804688, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8882734775543213, + "num_tokens": 262020200.0, + "step": 7188 + }, + { + "epoch": 1.335004642525534, + "grad_norm": 1.542543649673462, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.882815957069397, + "num_tokens": 262053815.0, + "step": 7189 + }, + { + "epoch": 1.3351903435468895, + "grad_norm": 1.6813997030258179, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8523677587509155, + "num_tokens": 262090504.0, + "step": 7190 + }, + { + "epoch": 1.3353760445682452, + "grad_norm": 1.5665103197097778, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8795047998428345, + "num_tokens": 262126359.0, + "step": 7191 + }, + { + "epoch": 1.3355617455896007, + "grad_norm": 1.4595866203308105, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8851286172866821, + "num_tokens": 262165296.0, + "step": 7192 + }, + { + "epoch": 1.3357474466109562, + "grad_norm": 1.53378427028656, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8776039481163025, + "num_tokens": 262199530.0, + "step": 7193 + }, + { + "epoch": 1.335933147632312, + "grad_norm": 1.4546170234680176, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8893740773200989, + "num_tokens": 262237148.0, + "step": 7194 + }, + { + "epoch": 1.3361188486536677, + "grad_norm": 1.484826683998108, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.863649845123291, + "num_tokens": 262275725.0, + "step": 7195 + }, + { + "epoch": 1.3363045496750232, + "grad_norm": 1.453203797340393, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8821635246276855, + "num_tokens": 262318107.0, + "step": 7196 + }, + { + "epoch": 1.3364902506963787, + "grad_norm": 1.4973641633987427, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.881737470626831, + "num_tokens": 262356495.0, + "step": 7197 + }, + { + "epoch": 1.3366759517177345, + "grad_norm": 1.6409975290298462, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8694352507591248, + "num_tokens": 262392690.0, + "step": 7198 + }, + { + "epoch": 1.3368616527390902, + "grad_norm": 1.4933770895004272, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8832806348800659, + "num_tokens": 262430220.0, + "step": 7199 + }, + { + "epoch": 1.3370473537604457, + "grad_norm": 1.5919694900512695, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.884765625, + "num_tokens": 262461484.0, + "step": 7200 + }, + { + "epoch": 1.3372330547818012, + "grad_norm": 1.6782721281051636, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.871592104434967, + "num_tokens": 262493944.0, + "step": 7201 + }, + { + "epoch": 1.337418755803157, + "grad_norm": 1.582540512084961, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8748359084129333, + "num_tokens": 262530656.0, + "step": 7202 + }, + { + "epoch": 1.3376044568245125, + "grad_norm": 1.5699706077575684, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8751391172409058, + "num_tokens": 262568325.0, + "step": 7203 + }, + { + "epoch": 1.3377901578458682, + "grad_norm": 1.4107376337051392, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8992218971252441, + "num_tokens": 262603179.0, + "step": 7204 + }, + { + "epoch": 1.3379758588672237, + "grad_norm": 1.5153329372406006, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8751178979873657, + "num_tokens": 262640581.0, + "step": 7205 + }, + { + "epoch": 1.3381615598885794, + "grad_norm": 1.4926601648330688, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.892313539981842, + "num_tokens": 262674206.0, + "step": 7206 + }, + { + "epoch": 1.338347260909935, + "grad_norm": 1.6940656900405884, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8697551488876343, + "num_tokens": 262702845.0, + "step": 7207 + }, + { + "epoch": 1.3385329619312907, + "grad_norm": 1.5406842231750488, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8694448471069336, + "num_tokens": 262743135.0, + "step": 7208 + }, + { + "epoch": 1.3387186629526462, + "grad_norm": 1.822283387184143, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8719069361686707, + "num_tokens": 262771341.0, + "step": 7209 + }, + { + "epoch": 1.338904363974002, + "grad_norm": 1.5445560216903687, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8809436559677124, + "num_tokens": 262807843.0, + "step": 7210 + }, + { + "epoch": 1.3390900649953574, + "grad_norm": 1.6103218793869019, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8723723292350769, + "num_tokens": 262844570.0, + "step": 7211 + }, + { + "epoch": 1.3392757660167132, + "grad_norm": 1.5537633895874023, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8882598280906677, + "num_tokens": 262879831.0, + "step": 7212 + }, + { + "epoch": 1.3394614670380687, + "grad_norm": 1.7417486906051636, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.86966472864151, + "num_tokens": 262915562.0, + "step": 7213 + }, + { + "epoch": 1.3396471680594244, + "grad_norm": 1.470583438873291, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8837924599647522, + "num_tokens": 262953655.0, + "step": 7214 + }, + { + "epoch": 1.33983286908078, + "grad_norm": 1.846120834350586, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8672964572906494, + "num_tokens": 262988286.0, + "step": 7215 + }, + { + "epoch": 1.3400185701021354, + "grad_norm": 1.5096662044525146, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8868731260299683, + "num_tokens": 263024357.0, + "step": 7216 + }, + { + "epoch": 1.3402042711234912, + "grad_norm": 1.4475682973861694, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8903393149375916, + "num_tokens": 263062407.0, + "step": 7217 + }, + { + "epoch": 1.340389972144847, + "grad_norm": 1.7852203845977783, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8773046731948853, + "num_tokens": 263090310.0, + "step": 7218 + }, + { + "epoch": 1.3405756731662024, + "grad_norm": 1.4518723487854004, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8735791444778442, + "num_tokens": 263130143.0, + "step": 7219 + }, + { + "epoch": 1.340761374187558, + "grad_norm": 1.3151830434799194, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8847402930259705, + "num_tokens": 263174068.0, + "step": 7220 + }, + { + "epoch": 1.3409470752089137, + "grad_norm": 1.5715675354003906, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8720619678497314, + "num_tokens": 263208344.0, + "step": 7221 + }, + { + "epoch": 1.3411327762302694, + "grad_norm": 1.541419267654419, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8781145811080933, + "num_tokens": 263246913.0, + "step": 7222 + }, + { + "epoch": 1.341318477251625, + "grad_norm": 1.6767374277114868, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8617233037948608, + "num_tokens": 263280131.0, + "step": 7223 + }, + { + "epoch": 1.3415041782729804, + "grad_norm": 1.5302270650863647, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.887980580329895, + "num_tokens": 263317400.0, + "step": 7224 + }, + { + "epoch": 1.3416898792943361, + "grad_norm": 1.4624956846237183, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8708754181861877, + "num_tokens": 263358513.0, + "step": 7225 + }, + { + "epoch": 1.3418755803156919, + "grad_norm": 1.5052428245544434, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.8918283581733704, + "num_tokens": 263390549.0, + "step": 7226 + }, + { + "epoch": 1.3420612813370474, + "grad_norm": 1.500958800315857, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8825010657310486, + "num_tokens": 263431073.0, + "step": 7227 + }, + { + "epoch": 1.342246982358403, + "grad_norm": 1.6876062154769897, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8785456418991089, + "num_tokens": 263466459.0, + "step": 7228 + }, + { + "epoch": 1.3424326833797586, + "grad_norm": 1.6299152374267578, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8819666504859924, + "num_tokens": 263500691.0, + "step": 7229 + }, + { + "epoch": 1.3426183844011141, + "grad_norm": 1.4436662197113037, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8821399211883545, + "num_tokens": 263541050.0, + "step": 7230 + }, + { + "epoch": 1.3428040854224699, + "grad_norm": 1.4708857536315918, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8565460443496704, + "num_tokens": 263584919.0, + "step": 7231 + }, + { + "epoch": 1.3429897864438254, + "grad_norm": 1.6079741716384888, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.874071478843689, + "num_tokens": 263618637.0, + "step": 7232 + }, + { + "epoch": 1.3431754874651811, + "grad_norm": 1.5215191841125488, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8779540657997131, + "num_tokens": 263655211.0, + "step": 7233 + }, + { + "epoch": 1.3433611884865366, + "grad_norm": 1.7310445308685303, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8758828043937683, + "num_tokens": 263690936.0, + "step": 7234 + }, + { + "epoch": 1.3435468895078924, + "grad_norm": 1.4878686666488647, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8833692073822021, + "num_tokens": 263731412.0, + "step": 7235 + }, + { + "epoch": 1.3437325905292479, + "grad_norm": 1.607177972793579, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8604382276535034, + "num_tokens": 263768937.0, + "step": 7236 + }, + { + "epoch": 1.3439182915506036, + "grad_norm": 1.5305397510528564, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.883195161819458, + "num_tokens": 263803635.0, + "step": 7237 + }, + { + "epoch": 1.3441039925719591, + "grad_norm": 1.478274941444397, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8739035725593567, + "num_tokens": 263844034.0, + "step": 7238 + }, + { + "epoch": 1.3442896935933146, + "grad_norm": 1.6869407892227173, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8719024658203125, + "num_tokens": 263879278.0, + "step": 7239 + }, + { + "epoch": 1.3444753946146704, + "grad_norm": 1.5078325271606445, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8859844207763672, + "num_tokens": 263917065.0, + "step": 7240 + }, + { + "epoch": 1.344661095636026, + "grad_norm": 1.643633246421814, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8581223487854004, + "num_tokens": 263953039.0, + "step": 7241 + }, + { + "epoch": 1.3448467966573816, + "grad_norm": 1.617191195487976, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8680556416511536, + "num_tokens": 263993204.0, + "step": 7242 + }, + { + "epoch": 1.3450324976787371, + "grad_norm": 1.5665708780288696, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8817784786224365, + "num_tokens": 264026112.0, + "step": 7243 + }, + { + "epoch": 1.3452181987000928, + "grad_norm": 1.656496524810791, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8712799549102783, + "num_tokens": 264060605.0, + "step": 7244 + }, + { + "epoch": 1.3454038997214486, + "grad_norm": 1.4014198780059814, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8816922307014465, + "num_tokens": 264102093.0, + "step": 7245 + }, + { + "epoch": 1.345589600742804, + "grad_norm": 1.5787137746810913, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8613173961639404, + "num_tokens": 264138693.0, + "step": 7246 + }, + { + "epoch": 1.3457753017641596, + "grad_norm": 1.6200000047683716, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.887535810470581, + "num_tokens": 264171756.0, + "step": 7247 + }, + { + "epoch": 1.3459610027855153, + "grad_norm": 1.5259835720062256, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8751082420349121, + "num_tokens": 264210208.0, + "step": 7248 + }, + { + "epoch": 1.346146703806871, + "grad_norm": 1.4589430093765259, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8706008195877075, + "num_tokens": 264251463.0, + "step": 7249 + }, + { + "epoch": 1.3463324048282266, + "grad_norm": 1.4918209314346313, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.879562258720398, + "num_tokens": 264289411.0, + "step": 7250 + }, + { + "epoch": 1.346518105849582, + "grad_norm": 1.7094321250915527, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8751899003982544, + "num_tokens": 264322098.0, + "step": 7251 + }, + { + "epoch": 1.3467038068709378, + "grad_norm": 1.5818301439285278, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.863541841506958, + "num_tokens": 264362262.0, + "step": 7252 + }, + { + "epoch": 1.3468895078922933, + "grad_norm": 1.7069087028503418, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.873085081577301, + "num_tokens": 264395026.0, + "step": 7253 + }, + { + "epoch": 1.347075208913649, + "grad_norm": 1.4219516515731812, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8914656639099121, + "num_tokens": 264432461.0, + "step": 7254 + }, + { + "epoch": 1.3472609099350046, + "grad_norm": 1.6095325946807861, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8684759140014648, + "num_tokens": 264468457.0, + "step": 7255 + }, + { + "epoch": 1.3474466109563603, + "grad_norm": 1.710779070854187, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8714761734008789, + "num_tokens": 264500242.0, + "step": 7256 + }, + { + "epoch": 1.3476323119777158, + "grad_norm": 1.6794530153274536, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8861413598060608, + "num_tokens": 264533315.0, + "step": 7257 + }, + { + "epoch": 1.3478180129990716, + "grad_norm": 1.7218246459960938, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8747619390487671, + "num_tokens": 264566715.0, + "step": 7258 + }, + { + "epoch": 1.348003714020427, + "grad_norm": 1.6407630443572998, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8772789239883423, + "num_tokens": 264597827.0, + "step": 7259 + }, + { + "epoch": 1.3481894150417828, + "grad_norm": 1.5791683197021484, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8757396936416626, + "num_tokens": 264633400.0, + "step": 7260 + }, + { + "epoch": 1.3483751160631383, + "grad_norm": 1.5478332042694092, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.879025399684906, + "num_tokens": 264669806.0, + "step": 7261 + }, + { + "epoch": 1.348560817084494, + "grad_norm": 1.5969326496124268, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8791307210922241, + "num_tokens": 264707635.0, + "step": 7262 + }, + { + "epoch": 1.3487465181058496, + "grad_norm": 1.6267666816711426, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8710025548934937, + "num_tokens": 264744496.0, + "step": 7263 + }, + { + "epoch": 1.3489322191272053, + "grad_norm": 1.6239140033721924, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8775418996810913, + "num_tokens": 264780382.0, + "step": 7264 + }, + { + "epoch": 1.3491179201485608, + "grad_norm": 1.5898364782333374, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8675932288169861, + "num_tokens": 264816255.0, + "step": 7265 + }, + { + "epoch": 1.3493036211699163, + "grad_norm": 1.4934364557266235, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8758009076118469, + "num_tokens": 264856678.0, + "step": 7266 + }, + { + "epoch": 1.349489322191272, + "grad_norm": 1.4660307168960571, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8724978566169739, + "num_tokens": 264901693.0, + "step": 7267 + }, + { + "epoch": 1.3496750232126278, + "grad_norm": 1.4234199523925781, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8933402895927429, + "num_tokens": 264938497.0, + "step": 7268 + }, + { + "epoch": 1.3498607242339833, + "grad_norm": 1.3860565423965454, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8781592845916748, + "num_tokens": 264982640.0, + "step": 7269 + }, + { + "epoch": 1.3500464252553388, + "grad_norm": 1.6584769487380981, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8918747901916504, + "num_tokens": 265013164.0, + "step": 7270 + }, + { + "epoch": 1.3502321262766945, + "grad_norm": 1.492310643196106, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8712278008460999, + "num_tokens": 265053395.0, + "step": 7271 + }, + { + "epoch": 1.3504178272980503, + "grad_norm": 1.5408246517181396, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8813283443450928, + "num_tokens": 265091174.0, + "step": 7272 + }, + { + "epoch": 1.3506035283194058, + "grad_norm": 1.524666428565979, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8782860040664673, + "num_tokens": 265132079.0, + "step": 7273 + }, + { + "epoch": 1.3507892293407613, + "grad_norm": 1.6130516529083252, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8830689191818237, + "num_tokens": 265167014.0, + "step": 7274 + }, + { + "epoch": 1.350974930362117, + "grad_norm": 1.562832236289978, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8685949444770813, + "num_tokens": 265206015.0, + "step": 7275 + }, + { + "epoch": 1.3511606313834725, + "grad_norm": 1.5705933570861816, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8820815682411194, + "num_tokens": 265242282.0, + "step": 7276 + }, + { + "epoch": 1.3513463324048283, + "grad_norm": 1.5535964965820312, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8793590068817139, + "num_tokens": 265278310.0, + "step": 7277 + }, + { + "epoch": 1.3515320334261838, + "grad_norm": 1.5502928495407104, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8860175609588623, + "num_tokens": 265313171.0, + "step": 7278 + }, + { + "epoch": 1.3517177344475395, + "grad_norm": 1.7294334173202515, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8579776287078857, + "num_tokens": 265345502.0, + "step": 7279 + }, + { + "epoch": 1.351903435468895, + "grad_norm": 1.5238642692565918, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8782535791397095, + "num_tokens": 265381898.0, + "step": 7280 + }, + { + "epoch": 1.3520891364902508, + "grad_norm": 1.5575753450393677, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8797906637191772, + "num_tokens": 265417317.0, + "step": 7281 + }, + { + "epoch": 1.3522748375116063, + "grad_norm": 1.5777419805526733, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.877935528755188, + "num_tokens": 265455106.0, + "step": 7282 + }, + { + "epoch": 1.352460538532962, + "grad_norm": 1.656044363975525, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8730627298355103, + "num_tokens": 265487692.0, + "step": 7283 + }, + { + "epoch": 1.3526462395543175, + "grad_norm": 1.4538497924804688, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8952609300613403, + "num_tokens": 265525919.0, + "step": 7284 + }, + { + "epoch": 1.3528319405756732, + "grad_norm": 1.5087448358535767, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8761829137802124, + "num_tokens": 265562613.0, + "step": 7285 + }, + { + "epoch": 1.3530176415970288, + "grad_norm": 1.611141562461853, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8573700785636902, + "num_tokens": 265597868.0, + "step": 7286 + }, + { + "epoch": 1.3532033426183845, + "grad_norm": 1.6591477394104004, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8640837073326111, + "num_tokens": 265633893.0, + "step": 7287 + }, + { + "epoch": 1.35338904363974, + "grad_norm": 1.6368169784545898, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8529675006866455, + "num_tokens": 265672453.0, + "step": 7288 + }, + { + "epoch": 1.3535747446610955, + "grad_norm": 1.4547595977783203, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8805819153785706, + "num_tokens": 265710449.0, + "step": 7289 + }, + { + "epoch": 1.3537604456824512, + "grad_norm": 1.419676423072815, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8867791891098022, + "num_tokens": 265749932.0, + "step": 7290 + }, + { + "epoch": 1.353946146703807, + "grad_norm": 1.6509230136871338, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8704005479812622, + "num_tokens": 265780047.0, + "step": 7291 + }, + { + "epoch": 1.3541318477251625, + "grad_norm": 1.5363185405731201, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8711025714874268, + "num_tokens": 265817537.0, + "step": 7292 + }, + { + "epoch": 1.354317548746518, + "grad_norm": 1.6824581623077393, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8793154954910278, + "num_tokens": 265847528.0, + "step": 7293 + }, + { + "epoch": 1.3545032497678737, + "grad_norm": 1.744956135749817, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8668968677520752, + "num_tokens": 265879173.0, + "step": 7294 + }, + { + "epoch": 1.3546889507892295, + "grad_norm": 1.5659866333007812, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8838030695915222, + "num_tokens": 265917377.0, + "step": 7295 + }, + { + "epoch": 1.354874651810585, + "grad_norm": 1.5813268423080444, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8754748106002808, + "num_tokens": 265952853.0, + "step": 7296 + }, + { + "epoch": 1.3550603528319405, + "grad_norm": 1.7528997659683228, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8769253492355347, + "num_tokens": 265988608.0, + "step": 7297 + }, + { + "epoch": 1.3552460538532962, + "grad_norm": 1.636603832244873, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.886479377746582, + "num_tokens": 266024466.0, + "step": 7298 + }, + { + "epoch": 1.3554317548746517, + "grad_norm": 1.4791418313980103, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8867780566215515, + "num_tokens": 266064237.0, + "step": 7299 + }, + { + "epoch": 1.3556174558960075, + "grad_norm": 1.626969337463379, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8646072149276733, + "num_tokens": 266099042.0, + "step": 7300 + }, + { + "epoch": 1.355803156917363, + "grad_norm": 1.7229323387145996, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8735960721969604, + "num_tokens": 266132587.0, + "step": 7301 + }, + { + "epoch": 1.3559888579387187, + "grad_norm": 1.5719016790390015, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8768593668937683, + "num_tokens": 266171420.0, + "step": 7302 + }, + { + "epoch": 1.3561745589600742, + "grad_norm": 1.672530174255371, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.880577027797699, + "num_tokens": 266203889.0, + "step": 7303 + }, + { + "epoch": 1.35636025998143, + "grad_norm": 1.6654679775238037, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8674476146697998, + "num_tokens": 266241024.0, + "step": 7304 + }, + { + "epoch": 1.3565459610027855, + "grad_norm": 1.5204336643218994, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.871783971786499, + "num_tokens": 266279246.0, + "step": 7305 + }, + { + "epoch": 1.3567316620241412, + "grad_norm": 1.627359390258789, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8846312761306763, + "num_tokens": 266312414.0, + "step": 7306 + }, + { + "epoch": 1.3569173630454967, + "grad_norm": 1.7513232231140137, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8564403653144836, + "num_tokens": 266345556.0, + "step": 7307 + }, + { + "epoch": 1.3571030640668524, + "grad_norm": 1.631072759628296, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8749386072158813, + "num_tokens": 266379665.0, + "step": 7308 + }, + { + "epoch": 1.357288765088208, + "grad_norm": 1.6277662515640259, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8722963929176331, + "num_tokens": 266416492.0, + "step": 7309 + }, + { + "epoch": 1.3574744661095637, + "grad_norm": 1.5537911653518677, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8695788383483887, + "num_tokens": 266456902.0, + "step": 7310 + }, + { + "epoch": 1.3576601671309192, + "grad_norm": 1.5829894542694092, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8837392926216125, + "num_tokens": 266489326.0, + "step": 7311 + }, + { + "epoch": 1.3578458681522747, + "grad_norm": 1.6377606391906738, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.87973552942276, + "num_tokens": 266523166.0, + "step": 7312 + }, + { + "epoch": 1.3580315691736304, + "grad_norm": 1.4825513362884521, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8672468662261963, + "num_tokens": 266563070.0, + "step": 7313 + }, + { + "epoch": 1.3582172701949862, + "grad_norm": 1.5026952028274536, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8942337036132812, + "num_tokens": 266596818.0, + "step": 7314 + }, + { + "epoch": 1.3584029712163417, + "grad_norm": 1.4788768291473389, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8787911534309387, + "num_tokens": 266637497.0, + "step": 7315 + }, + { + "epoch": 1.3585886722376972, + "grad_norm": 1.6703505516052246, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.869909405708313, + "num_tokens": 266669886.0, + "step": 7316 + }, + { + "epoch": 1.358774373259053, + "grad_norm": 1.4638599157333374, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8858200907707214, + "num_tokens": 266708885.0, + "step": 7317 + }, + { + "epoch": 1.3589600742804087, + "grad_norm": 1.5401238203048706, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8742808699607849, + "num_tokens": 266745872.0, + "step": 7318 + }, + { + "epoch": 1.3591457753017642, + "grad_norm": 1.6137140989303589, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8825108408927917, + "num_tokens": 266778960.0, + "step": 7319 + }, + { + "epoch": 1.3593314763231197, + "grad_norm": 1.7340476512908936, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8618127703666687, + "num_tokens": 266812749.0, + "step": 7320 + }, + { + "epoch": 1.3595171773444754, + "grad_norm": 1.6693562269210815, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8587482571601868, + "num_tokens": 266847803.0, + "step": 7321 + }, + { + "epoch": 1.3597028783658311, + "grad_norm": 1.4254815578460693, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8852499127388, + "num_tokens": 266888439.0, + "step": 7322 + }, + { + "epoch": 1.3598885793871867, + "grad_norm": 1.5823637247085571, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8713014125823975, + "num_tokens": 266926712.0, + "step": 7323 + }, + { + "epoch": 1.3600742804085422, + "grad_norm": 1.560129165649414, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8811314105987549, + "num_tokens": 266962314.0, + "step": 7324 + }, + { + "epoch": 1.360259981429898, + "grad_norm": 1.60292649269104, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8683656454086304, + "num_tokens": 267004051.0, + "step": 7325 + }, + { + "epoch": 1.3604456824512534, + "grad_norm": 1.569517970085144, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8718032836914062, + "num_tokens": 267039791.0, + "step": 7326 + }, + { + "epoch": 1.3606313834726091, + "grad_norm": 1.5712460279464722, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.864363431930542, + "num_tokens": 267078381.0, + "step": 7327 + }, + { + "epoch": 1.3608170844939647, + "grad_norm": 1.5348906517028809, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8722871541976929, + "num_tokens": 267116625.0, + "step": 7328 + }, + { + "epoch": 1.3610027855153204, + "grad_norm": 1.4142658710479736, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8895806670188904, + "num_tokens": 267155441.0, + "step": 7329 + }, + { + "epoch": 1.361188486536676, + "grad_norm": 1.5859262943267822, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8848245739936829, + "num_tokens": 267188929.0, + "step": 7330 + }, + { + "epoch": 1.3613741875580316, + "grad_norm": 1.6209099292755127, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8808318376541138, + "num_tokens": 267219145.0, + "step": 7331 + }, + { + "epoch": 1.3615598885793871, + "grad_norm": 1.4663230180740356, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8727898597717285, + "num_tokens": 267264474.0, + "step": 7332 + }, + { + "epoch": 1.3617455896007429, + "grad_norm": 1.5922173261642456, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8914459347724915, + "num_tokens": 267302252.0, + "step": 7333 + }, + { + "epoch": 1.3619312906220984, + "grad_norm": 1.5684419870376587, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8760160207748413, + "num_tokens": 267340040.0, + "step": 7334 + }, + { + "epoch": 1.362116991643454, + "grad_norm": 1.429097294807434, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8893312811851501, + "num_tokens": 267381717.0, + "step": 7335 + }, + { + "epoch": 1.3623026926648096, + "grad_norm": 1.599522590637207, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8753371834754944, + "num_tokens": 267418200.0, + "step": 7336 + }, + { + "epoch": 1.3624883936861654, + "grad_norm": 1.5764880180358887, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8672217130661011, + "num_tokens": 267455755.0, + "step": 7337 + }, + { + "epoch": 1.3626740947075209, + "grad_norm": 1.6767910718917847, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.874708354473114, + "num_tokens": 267488781.0, + "step": 7338 + }, + { + "epoch": 1.3628597957288764, + "grad_norm": 1.4537403583526611, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8726654052734375, + "num_tokens": 267532885.0, + "step": 7339 + }, + { + "epoch": 1.3630454967502321, + "grad_norm": 1.611998438835144, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8852216005325317, + "num_tokens": 267567418.0, + "step": 7340 + }, + { + "epoch": 1.3632311977715879, + "grad_norm": 1.5228500366210938, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8849833011627197, + "num_tokens": 267603106.0, + "step": 7341 + }, + { + "epoch": 1.3634168987929434, + "grad_norm": 1.7652884721755981, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8621394634246826, + "num_tokens": 267634195.0, + "step": 7342 + }, + { + "epoch": 1.3636025998142989, + "grad_norm": 1.5840424299240112, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8786624670028687, + "num_tokens": 267669043.0, + "step": 7343 + }, + { + "epoch": 1.3637883008356546, + "grad_norm": 1.7782154083251953, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8670156002044678, + "num_tokens": 267699614.0, + "step": 7344 + }, + { + "epoch": 1.3639740018570103, + "grad_norm": 1.545880913734436, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8821980953216553, + "num_tokens": 267742537.0, + "step": 7345 + }, + { + "epoch": 1.3641597028783659, + "grad_norm": 1.4576728343963623, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8767671585083008, + "num_tokens": 267783614.0, + "step": 7346 + }, + { + "epoch": 1.3643454038997214, + "grad_norm": 1.5800666809082031, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8824161291122437, + "num_tokens": 267816689.0, + "step": 7347 + }, + { + "epoch": 1.364531104921077, + "grad_norm": 1.6637680530548096, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8790649771690369, + "num_tokens": 267848601.0, + "step": 7348 + }, + { + "epoch": 1.3647168059424326, + "grad_norm": 1.4972922801971436, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8898707628250122, + "num_tokens": 267884717.0, + "step": 7349 + }, + { + "epoch": 1.3649025069637883, + "grad_norm": 1.6825613975524902, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8747566342353821, + "num_tokens": 267915601.0, + "step": 7350 + }, + { + "epoch": 1.3650882079851439, + "grad_norm": 1.7146159410476685, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8709236979484558, + "num_tokens": 267948605.0, + "step": 7351 + }, + { + "epoch": 1.3652739090064996, + "grad_norm": 1.5247876644134521, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8792869448661804, + "num_tokens": 267986258.0, + "step": 7352 + }, + { + "epoch": 1.365459610027855, + "grad_norm": 1.5702764987945557, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8892632722854614, + "num_tokens": 268020510.0, + "step": 7353 + }, + { + "epoch": 1.3656453110492108, + "grad_norm": 1.5653170347213745, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8930709362030029, + "num_tokens": 268054236.0, + "step": 7354 + }, + { + "epoch": 1.3658310120705663, + "grad_norm": 1.6578682661056519, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8844777941703796, + "num_tokens": 268086177.0, + "step": 7355 + }, + { + "epoch": 1.366016713091922, + "grad_norm": 1.5797784328460693, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8668580055236816, + "num_tokens": 268125988.0, + "step": 7356 + }, + { + "epoch": 1.3662024141132776, + "grad_norm": 1.5317869186401367, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.872901201248169, + "num_tokens": 268164799.0, + "step": 7357 + }, + { + "epoch": 1.3663881151346333, + "grad_norm": 1.5355271100997925, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8835118412971497, + "num_tokens": 268199861.0, + "step": 7358 + }, + { + "epoch": 1.3665738161559888, + "grad_norm": 1.4677850008010864, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8826409578323364, + "num_tokens": 268247566.0, + "step": 7359 + }, + { + "epoch": 1.3667595171773446, + "grad_norm": 1.8776494264602661, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.882919192314148, + "num_tokens": 268277347.0, + "step": 7360 + }, + { + "epoch": 1.3669452181987, + "grad_norm": 1.5792487859725952, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.887651264667511, + "num_tokens": 268310555.0, + "step": 7361 + }, + { + "epoch": 1.3671309192200556, + "grad_norm": 1.4705575704574585, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8912352323532104, + "num_tokens": 268346826.0, + "step": 7362 + }, + { + "epoch": 1.3673166202414113, + "grad_norm": 1.7207005023956299, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8678736090660095, + "num_tokens": 268383699.0, + "step": 7363 + }, + { + "epoch": 1.367502321262767, + "grad_norm": 1.6731666326522827, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8740280866622925, + "num_tokens": 268420555.0, + "step": 7364 + }, + { + "epoch": 1.3676880222841226, + "grad_norm": 1.5458464622497559, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8930529356002808, + "num_tokens": 268457706.0, + "step": 7365 + }, + { + "epoch": 1.367873723305478, + "grad_norm": 1.6298449039459229, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8870813846588135, + "num_tokens": 268490758.0, + "step": 7366 + }, + { + "epoch": 1.3680594243268338, + "grad_norm": 1.605323076248169, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.878006637096405, + "num_tokens": 268524553.0, + "step": 7367 + }, + { + "epoch": 1.3682451253481895, + "grad_norm": 1.5672380924224854, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8638116121292114, + "num_tokens": 268560873.0, + "step": 7368 + }, + { + "epoch": 1.368430826369545, + "grad_norm": 1.5064234733581543, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8728603720664978, + "num_tokens": 268600781.0, + "step": 7369 + }, + { + "epoch": 1.3686165273909006, + "grad_norm": 1.4362012147903442, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8719290494918823, + "num_tokens": 268647183.0, + "step": 7370 + }, + { + "epoch": 1.3688022284122563, + "grad_norm": 1.5425461530685425, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8911155462265015, + "num_tokens": 268684888.0, + "step": 7371 + }, + { + "epoch": 1.3689879294336118, + "grad_norm": 1.4673982858657837, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8701892495155334, + "num_tokens": 268728524.0, + "step": 7372 + }, + { + "epoch": 1.3691736304549675, + "grad_norm": 1.4830209016799927, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8877562880516052, + "num_tokens": 268765620.0, + "step": 7373 + }, + { + "epoch": 1.369359331476323, + "grad_norm": 1.4503415822982788, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8781953454017639, + "num_tokens": 268805379.0, + "step": 7374 + }, + { + "epoch": 1.3695450324976788, + "grad_norm": 1.7327545881271362, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8710124492645264, + "num_tokens": 268837619.0, + "step": 7375 + }, + { + "epoch": 1.3697307335190343, + "grad_norm": 1.5121970176696777, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8726480007171631, + "num_tokens": 268876750.0, + "step": 7376 + }, + { + "epoch": 1.36991643454039, + "grad_norm": 1.8202567100524902, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8764146566390991, + "num_tokens": 268906371.0, + "step": 7377 + }, + { + "epoch": 1.3701021355617455, + "grad_norm": 1.5322749614715576, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8817397356033325, + "num_tokens": 268942466.0, + "step": 7378 + }, + { + "epoch": 1.3702878365831013, + "grad_norm": 1.64602792263031, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8805164098739624, + "num_tokens": 268979886.0, + "step": 7379 + }, + { + "epoch": 1.3704735376044568, + "grad_norm": 1.9719902276992798, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8728806972503662, + "num_tokens": 269018832.0, + "step": 7380 + }, + { + "epoch": 1.3706592386258125, + "grad_norm": 1.6141265630722046, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8718787431716919, + "num_tokens": 269057207.0, + "step": 7381 + }, + { + "epoch": 1.370844939647168, + "grad_norm": 1.7428853511810303, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8692865371704102, + "num_tokens": 269089012.0, + "step": 7382 + }, + { + "epoch": 1.3710306406685238, + "grad_norm": 1.7177261114120483, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8762441873550415, + "num_tokens": 269123084.0, + "step": 7383 + }, + { + "epoch": 1.3712163416898793, + "grad_norm": 1.5461666584014893, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8720924854278564, + "num_tokens": 269160582.0, + "step": 7384 + }, + { + "epoch": 1.3714020427112348, + "grad_norm": 1.5001221895217896, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8839502334594727, + "num_tokens": 269196377.0, + "step": 7385 + }, + { + "epoch": 1.3715877437325905, + "grad_norm": 1.4303492307662964, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8796055316925049, + "num_tokens": 269235078.0, + "step": 7386 + }, + { + "epoch": 1.3717734447539462, + "grad_norm": 1.3823001384735107, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8808413743972778, + "num_tokens": 269278203.0, + "step": 7387 + }, + { + "epoch": 1.3719591457753018, + "grad_norm": 1.5558289289474487, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.853848934173584, + "num_tokens": 269317057.0, + "step": 7388 + }, + { + "epoch": 1.3721448467966573, + "grad_norm": 1.562346339225769, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8729425668716431, + "num_tokens": 269352086.0, + "step": 7389 + }, + { + "epoch": 1.372330547818013, + "grad_norm": 1.5710951089859009, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8662077784538269, + "num_tokens": 269390357.0, + "step": 7390 + }, + { + "epoch": 1.3725162488393687, + "grad_norm": 1.4556777477264404, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8702302575111389, + "num_tokens": 269432913.0, + "step": 7391 + }, + { + "epoch": 1.3727019498607242, + "grad_norm": 1.415112853050232, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8867523074150085, + "num_tokens": 269472442.0, + "step": 7392 + }, + { + "epoch": 1.3728876508820798, + "grad_norm": 1.3783905506134033, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8787643909454346, + "num_tokens": 269518016.0, + "step": 7393 + }, + { + "epoch": 1.3730733519034355, + "grad_norm": 1.5952600240707397, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8725529313087463, + "num_tokens": 269552374.0, + "step": 7394 + }, + { + "epoch": 1.3732590529247912, + "grad_norm": 1.4337393045425415, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8871235847473145, + "num_tokens": 269591494.0, + "step": 7395 + }, + { + "epoch": 1.3734447539461467, + "grad_norm": 1.4680871963500977, + "learning_rate": 1e-06, + "loss": 0.2673, + "mean_token_accuracy": 0.9028042554855347, + "num_tokens": 269623564.0, + "step": 7396 + }, + { + "epoch": 1.3736304549675022, + "grad_norm": 1.6453661918640137, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8804613351821899, + "num_tokens": 269656912.0, + "step": 7397 + }, + { + "epoch": 1.373816155988858, + "grad_norm": 1.5714786052703857, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8783935308456421, + "num_tokens": 269692847.0, + "step": 7398 + }, + { + "epoch": 1.3740018570102135, + "grad_norm": 1.599002718925476, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8755946159362793, + "num_tokens": 269726682.0, + "step": 7399 + }, + { + "epoch": 1.3741875580315692, + "grad_norm": 1.4445104598999023, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8924245834350586, + "num_tokens": 269765286.0, + "step": 7400 + }, + { + "epoch": 1.3743732590529247, + "grad_norm": 1.6062045097351074, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8755501508712769, + "num_tokens": 269800876.0, + "step": 7401 + }, + { + "epoch": 1.3745589600742805, + "grad_norm": 1.6147924661636353, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8802294731140137, + "num_tokens": 269836421.0, + "step": 7402 + }, + { + "epoch": 1.374744661095636, + "grad_norm": 1.6387377977371216, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8757501244544983, + "num_tokens": 269871812.0, + "step": 7403 + }, + { + "epoch": 1.3749303621169917, + "grad_norm": 1.6125576496124268, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8613371849060059, + "num_tokens": 269906465.0, + "step": 7404 + }, + { + "epoch": 1.3751160631383472, + "grad_norm": 1.4336010217666626, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8819965124130249, + "num_tokens": 269945984.0, + "step": 7405 + }, + { + "epoch": 1.375301764159703, + "grad_norm": 1.5606833696365356, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.876969575881958, + "num_tokens": 269983453.0, + "step": 7406 + }, + { + "epoch": 1.3754874651810585, + "grad_norm": 1.7525842189788818, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8643479347229004, + "num_tokens": 270016613.0, + "step": 7407 + }, + { + "epoch": 1.375673166202414, + "grad_norm": 1.6016522645950317, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8807205557823181, + "num_tokens": 270050239.0, + "step": 7408 + }, + { + "epoch": 1.3758588672237697, + "grad_norm": 1.5220946073532104, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8907551169395447, + "num_tokens": 270085286.0, + "step": 7409 + }, + { + "epoch": 1.3760445682451254, + "grad_norm": 1.4559298753738403, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8848047852516174, + "num_tokens": 270123951.0, + "step": 7410 + }, + { + "epoch": 1.376230269266481, + "grad_norm": 1.7495005130767822, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8727545738220215, + "num_tokens": 270157408.0, + "step": 7411 + }, + { + "epoch": 1.3764159702878365, + "grad_norm": 1.5402076244354248, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8775618076324463, + "num_tokens": 270195606.0, + "step": 7412 + }, + { + "epoch": 1.3766016713091922, + "grad_norm": 1.6397532224655151, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8802255988121033, + "num_tokens": 270229423.0, + "step": 7413 + }, + { + "epoch": 1.376787372330548, + "grad_norm": 1.5429767370224, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8787473440170288, + "num_tokens": 270265138.0, + "step": 7414 + }, + { + "epoch": 1.3769730733519034, + "grad_norm": 1.652194857597351, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8604909181594849, + "num_tokens": 270302692.0, + "step": 7415 + }, + { + "epoch": 1.377158774373259, + "grad_norm": 1.5411049127578735, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8698573112487793, + "num_tokens": 270348097.0, + "step": 7416 + }, + { + "epoch": 1.3773444753946147, + "grad_norm": 1.661409616470337, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8636496067047119, + "num_tokens": 270380654.0, + "step": 7417 + }, + { + "epoch": 1.3775301764159704, + "grad_norm": 1.403468132019043, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.892082691192627, + "num_tokens": 270419259.0, + "step": 7418 + }, + { + "epoch": 1.377715877437326, + "grad_norm": 1.6625006198883057, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8768653869628906, + "num_tokens": 270454227.0, + "step": 7419 + }, + { + "epoch": 1.3779015784586814, + "grad_norm": 1.4432145357131958, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8740447759628296, + "num_tokens": 270493650.0, + "step": 7420 + }, + { + "epoch": 1.3780872794800372, + "grad_norm": 1.499815821647644, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8747568130493164, + "num_tokens": 270534790.0, + "step": 7421 + }, + { + "epoch": 1.3782729805013927, + "grad_norm": 1.503027319908142, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8599894642829895, + "num_tokens": 270578115.0, + "step": 7422 + }, + { + "epoch": 1.3784586815227484, + "grad_norm": 1.5351094007492065, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8738030195236206, + "num_tokens": 270615492.0, + "step": 7423 + }, + { + "epoch": 1.378644382544104, + "grad_norm": 1.5486246347427368, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8665326833724976, + "num_tokens": 270654068.0, + "step": 7424 + }, + { + "epoch": 1.3788300835654597, + "grad_norm": 1.578047513961792, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8726290464401245, + "num_tokens": 270690013.0, + "step": 7425 + }, + { + "epoch": 1.3790157845868152, + "grad_norm": 1.7743570804595947, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8890491127967834, + "num_tokens": 270717974.0, + "step": 7426 + }, + { + "epoch": 1.379201485608171, + "grad_norm": 1.651556372642517, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.879517674446106, + "num_tokens": 270753940.0, + "step": 7427 + }, + { + "epoch": 1.3793871866295264, + "grad_norm": 1.6653839349746704, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8556118011474609, + "num_tokens": 270788021.0, + "step": 7428 + }, + { + "epoch": 1.3795728876508822, + "grad_norm": 1.5305964946746826, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.866946280002594, + "num_tokens": 270825165.0, + "step": 7429 + }, + { + "epoch": 1.3797585886722377, + "grad_norm": 1.7107566595077515, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8756747841835022, + "num_tokens": 270853428.0, + "step": 7430 + }, + { + "epoch": 1.3799442896935934, + "grad_norm": 1.5258795022964478, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8752435445785522, + "num_tokens": 270892896.0, + "step": 7431 + }, + { + "epoch": 1.380129990714949, + "grad_norm": 1.6049753427505493, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8747133016586304, + "num_tokens": 270927982.0, + "step": 7432 + }, + { + "epoch": 1.3803156917363046, + "grad_norm": 1.6236648559570312, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8882321119308472, + "num_tokens": 270961610.0, + "step": 7433 + }, + { + "epoch": 1.3805013927576602, + "grad_norm": 1.5738410949707031, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8917046189308167, + "num_tokens": 270993421.0, + "step": 7434 + }, + { + "epoch": 1.3806870937790157, + "grad_norm": 1.6604689359664917, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8562784194946289, + "num_tokens": 271030246.0, + "step": 7435 + }, + { + "epoch": 1.3808727948003714, + "grad_norm": 1.4750694036483765, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8910127878189087, + "num_tokens": 271068406.0, + "step": 7436 + }, + { + "epoch": 1.3810584958217271, + "grad_norm": 1.8398430347442627, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8858455419540405, + "num_tokens": 271098196.0, + "step": 7437 + }, + { + "epoch": 1.3812441968430826, + "grad_norm": 1.4363365173339844, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8856621980667114, + "num_tokens": 271137063.0, + "step": 7438 + }, + { + "epoch": 1.3814298978644381, + "grad_norm": 1.6311705112457275, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.884311854839325, + "num_tokens": 271170178.0, + "step": 7439 + }, + { + "epoch": 1.3816155988857939, + "grad_norm": 1.4495054483413696, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.866943359375, + "num_tokens": 271210952.0, + "step": 7440 + }, + { + "epoch": 1.3818012999071496, + "grad_norm": 1.4685242176055908, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8878868818283081, + "num_tokens": 271251568.0, + "step": 7441 + }, + { + "epoch": 1.3819870009285051, + "grad_norm": 1.5166467428207397, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8603671193122864, + "num_tokens": 271294080.0, + "step": 7442 + }, + { + "epoch": 1.3821727019498606, + "grad_norm": 1.5519834756851196, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8712663650512695, + "num_tokens": 271330732.0, + "step": 7443 + }, + { + "epoch": 1.3823584029712164, + "grad_norm": 1.6822766065597534, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8639860153198242, + "num_tokens": 271363133.0, + "step": 7444 + }, + { + "epoch": 1.3825441039925719, + "grad_norm": 1.4885272979736328, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8775243163108826, + "num_tokens": 271402046.0, + "step": 7445 + }, + { + "epoch": 1.3827298050139276, + "grad_norm": 1.4372997283935547, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8873038291931152, + "num_tokens": 271441165.0, + "step": 7446 + }, + { + "epoch": 1.3829155060352831, + "grad_norm": 1.5416151285171509, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.874130129814148, + "num_tokens": 271478017.0, + "step": 7447 + }, + { + "epoch": 1.3831012070566389, + "grad_norm": 1.6033234596252441, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8640011548995972, + "num_tokens": 271516424.0, + "step": 7448 + }, + { + "epoch": 1.3832869080779944, + "grad_norm": 1.5765395164489746, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8560552597045898, + "num_tokens": 271555977.0, + "step": 7449 + }, + { + "epoch": 1.38347260909935, + "grad_norm": 1.5390169620513916, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8702154159545898, + "num_tokens": 271593796.0, + "step": 7450 + }, + { + "epoch": 1.3836583101207056, + "grad_norm": 1.4977608919143677, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8908154964447021, + "num_tokens": 271630180.0, + "step": 7451 + }, + { + "epoch": 1.3838440111420613, + "grad_norm": 1.4469165802001953, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8770321607589722, + "num_tokens": 271669911.0, + "step": 7452 + }, + { + "epoch": 1.3840297121634169, + "grad_norm": 1.5903825759887695, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8695971369743347, + "num_tokens": 271710001.0, + "step": 7453 + }, + { + "epoch": 1.3842154131847726, + "grad_norm": 1.6158326864242554, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8775560855865479, + "num_tokens": 271743530.0, + "step": 7454 + }, + { + "epoch": 1.384401114206128, + "grad_norm": 1.590693712234497, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8687156438827515, + "num_tokens": 271779696.0, + "step": 7455 + }, + { + "epoch": 1.3845868152274838, + "grad_norm": 1.6657480001449585, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8768312931060791, + "num_tokens": 271812233.0, + "step": 7456 + }, + { + "epoch": 1.3847725162488393, + "grad_norm": 1.5664000511169434, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8660284280776978, + "num_tokens": 271847954.0, + "step": 7457 + }, + { + "epoch": 1.3849582172701949, + "grad_norm": 1.5218695402145386, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8757106065750122, + "num_tokens": 271882940.0, + "step": 7458 + }, + { + "epoch": 1.3851439182915506, + "grad_norm": 1.5362871885299683, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8804633617401123, + "num_tokens": 271918739.0, + "step": 7459 + }, + { + "epoch": 1.3853296193129063, + "grad_norm": 1.5348320007324219, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8842563629150391, + "num_tokens": 271955349.0, + "step": 7460 + }, + { + "epoch": 1.3855153203342618, + "grad_norm": 1.6260535717010498, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8731862306594849, + "num_tokens": 271990135.0, + "step": 7461 + }, + { + "epoch": 1.3857010213556173, + "grad_norm": 1.6046600341796875, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8836705088615417, + "num_tokens": 272023356.0, + "step": 7462 + }, + { + "epoch": 1.385886722376973, + "grad_norm": 1.3745460510253906, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8800086975097656, + "num_tokens": 272065605.0, + "step": 7463 + }, + { + "epoch": 1.3860724233983288, + "grad_norm": 1.5618321895599365, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8712716102600098, + "num_tokens": 272102254.0, + "step": 7464 + }, + { + "epoch": 1.3862581244196843, + "grad_norm": 1.5545659065246582, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8701221942901611, + "num_tokens": 272135836.0, + "step": 7465 + }, + { + "epoch": 1.3864438254410398, + "grad_norm": 1.5459747314453125, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8738036751747131, + "num_tokens": 272172062.0, + "step": 7466 + }, + { + "epoch": 1.3866295264623956, + "grad_norm": 1.3620744943618774, + "learning_rate": 1e-06, + "loss": 0.2689, + "mean_token_accuracy": 0.9044365286827087, + "num_tokens": 272212409.0, + "step": 7467 + }, + { + "epoch": 1.386815227483751, + "grad_norm": 1.6324200630187988, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8745847940444946, + "num_tokens": 272247064.0, + "step": 7468 + }, + { + "epoch": 1.3870009285051068, + "grad_norm": 1.5092071294784546, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8878786563873291, + "num_tokens": 272286879.0, + "step": 7469 + }, + { + "epoch": 1.3871866295264623, + "grad_norm": 1.5677452087402344, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.882925808429718, + "num_tokens": 272324027.0, + "step": 7470 + }, + { + "epoch": 1.387372330547818, + "grad_norm": 1.592673420906067, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8811619877815247, + "num_tokens": 272360949.0, + "step": 7471 + }, + { + "epoch": 1.3875580315691736, + "grad_norm": 1.6361204385757446, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8622230887413025, + "num_tokens": 272397457.0, + "step": 7472 + }, + { + "epoch": 1.3877437325905293, + "grad_norm": 1.47310209274292, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8773552775382996, + "num_tokens": 272435483.0, + "step": 7473 + }, + { + "epoch": 1.3879294336118848, + "grad_norm": 1.475907564163208, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8688818216323853, + "num_tokens": 272480894.0, + "step": 7474 + }, + { + "epoch": 1.3881151346332405, + "grad_norm": 1.527258038520813, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8718777894973755, + "num_tokens": 272519124.0, + "step": 7475 + }, + { + "epoch": 1.388300835654596, + "grad_norm": 1.5796916484832764, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.887751579284668, + "num_tokens": 272555766.0, + "step": 7476 + }, + { + "epoch": 1.3884865366759518, + "grad_norm": 1.6462656259536743, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.873018741607666, + "num_tokens": 272593015.0, + "step": 7477 + }, + { + "epoch": 1.3886722376973073, + "grad_norm": 1.7217397689819336, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8693645000457764, + "num_tokens": 272623392.0, + "step": 7478 + }, + { + "epoch": 1.388857938718663, + "grad_norm": 1.7071832418441772, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8752880692481995, + "num_tokens": 272653144.0, + "step": 7479 + }, + { + "epoch": 1.3890436397400185, + "grad_norm": 1.4978307485580444, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8890488147735596, + "num_tokens": 272689078.0, + "step": 7480 + }, + { + "epoch": 1.389229340761374, + "grad_norm": 1.425560474395752, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.8978102207183838, + "num_tokens": 272726059.0, + "step": 7481 + }, + { + "epoch": 1.3894150417827298, + "grad_norm": 1.5343855619430542, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8640612363815308, + "num_tokens": 272767992.0, + "step": 7482 + }, + { + "epoch": 1.3896007428040855, + "grad_norm": 1.5286699533462524, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.878433346748352, + "num_tokens": 272806654.0, + "step": 7483 + }, + { + "epoch": 1.389786443825441, + "grad_norm": 1.5056414604187012, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8646514415740967, + "num_tokens": 272845655.0, + "step": 7484 + }, + { + "epoch": 1.3899721448467965, + "grad_norm": 1.647521734237671, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8792527914047241, + "num_tokens": 272877839.0, + "step": 7485 + }, + { + "epoch": 1.3901578458681523, + "grad_norm": 1.679850697517395, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8742047548294067, + "num_tokens": 272916171.0, + "step": 7486 + }, + { + "epoch": 1.390343546889508, + "grad_norm": 1.4411752223968506, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8716515898704529, + "num_tokens": 272958068.0, + "step": 7487 + }, + { + "epoch": 1.3905292479108635, + "grad_norm": 1.5032014846801758, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8833577632904053, + "num_tokens": 272994967.0, + "step": 7488 + }, + { + "epoch": 1.390714948932219, + "grad_norm": 1.7283188104629517, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8518651723861694, + "num_tokens": 273032767.0, + "step": 7489 + }, + { + "epoch": 1.3909006499535748, + "grad_norm": 1.6109099388122559, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8568958044052124, + "num_tokens": 273071828.0, + "step": 7490 + }, + { + "epoch": 1.3910863509749305, + "grad_norm": 1.5762758255004883, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8754658699035645, + "num_tokens": 273107357.0, + "step": 7491 + }, + { + "epoch": 1.391272051996286, + "grad_norm": 1.4560534954071045, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8878320455551147, + "num_tokens": 273150449.0, + "step": 7492 + }, + { + "epoch": 1.3914577530176415, + "grad_norm": 1.4727734327316284, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8735756278038025, + "num_tokens": 273190359.0, + "step": 7493 + }, + { + "epoch": 1.3916434540389973, + "grad_norm": 1.4625399112701416, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8805245161056519, + "num_tokens": 273226050.0, + "step": 7494 + }, + { + "epoch": 1.3918291550603528, + "grad_norm": 1.6531081199645996, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8723604679107666, + "num_tokens": 273261937.0, + "step": 7495 + }, + { + "epoch": 1.3920148560817085, + "grad_norm": 1.539168119430542, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8763058185577393, + "num_tokens": 273297509.0, + "step": 7496 + }, + { + "epoch": 1.392200557103064, + "grad_norm": 1.3358685970306396, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8915914297103882, + "num_tokens": 273336701.0, + "step": 7497 + }, + { + "epoch": 1.3923862581244197, + "grad_norm": 1.5041719675064087, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8772434592247009, + "num_tokens": 273373228.0, + "step": 7498 + }, + { + "epoch": 1.3925719591457753, + "grad_norm": 1.465437650680542, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8828378319740295, + "num_tokens": 273414024.0, + "step": 7499 + }, + { + "epoch": 1.392757660167131, + "grad_norm": 1.4891246557235718, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8785845637321472, + "num_tokens": 273450567.0, + "step": 7500 + }, + { + "epoch": 1.3929433611884865, + "grad_norm": 1.3571420907974243, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8821792602539062, + "num_tokens": 273494589.0, + "step": 7501 + }, + { + "epoch": 1.3931290622098422, + "grad_norm": 1.5341391563415527, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.877619206905365, + "num_tokens": 273531820.0, + "step": 7502 + }, + { + "epoch": 1.3933147632311977, + "grad_norm": 1.4549314975738525, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8794301152229309, + "num_tokens": 273573003.0, + "step": 7503 + }, + { + "epoch": 1.3935004642525533, + "grad_norm": 1.5057960748672485, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8779721856117249, + "num_tokens": 273606740.0, + "step": 7504 + }, + { + "epoch": 1.393686165273909, + "grad_norm": 1.496453881263733, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8781442642211914, + "num_tokens": 273645854.0, + "step": 7505 + }, + { + "epoch": 1.3938718662952647, + "grad_norm": 1.61687171459198, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.86927729845047, + "num_tokens": 273680197.0, + "step": 7506 + }, + { + "epoch": 1.3940575673166202, + "grad_norm": 1.621757984161377, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8823752403259277, + "num_tokens": 273712678.0, + "step": 7507 + }, + { + "epoch": 1.3942432683379757, + "grad_norm": 1.5741528272628784, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8830965757369995, + "num_tokens": 273745950.0, + "step": 7508 + }, + { + "epoch": 1.3944289693593315, + "grad_norm": 1.5520225763320923, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8856195211410522, + "num_tokens": 273782845.0, + "step": 7509 + }, + { + "epoch": 1.3946146703806872, + "grad_norm": 1.4677454233169556, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8727105855941772, + "num_tokens": 273822768.0, + "step": 7510 + }, + { + "epoch": 1.3948003714020427, + "grad_norm": 1.5657010078430176, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8746337890625, + "num_tokens": 273855046.0, + "step": 7511 + }, + { + "epoch": 1.3949860724233982, + "grad_norm": 1.4219833612442017, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.869757890701294, + "num_tokens": 273902041.0, + "step": 7512 + }, + { + "epoch": 1.395171773444754, + "grad_norm": 1.405681848526001, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8743299841880798, + "num_tokens": 273945301.0, + "step": 7513 + }, + { + "epoch": 1.3953574744661097, + "grad_norm": 1.5048481225967407, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8630610704421997, + "num_tokens": 273986121.0, + "step": 7514 + }, + { + "epoch": 1.3955431754874652, + "grad_norm": 1.4748387336730957, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8630870580673218, + "num_tokens": 274025763.0, + "step": 7515 + }, + { + "epoch": 1.3957288765088207, + "grad_norm": 1.6312042474746704, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8524932861328125, + "num_tokens": 274064510.0, + "step": 7516 + }, + { + "epoch": 1.3959145775301764, + "grad_norm": 1.4750514030456543, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8736556172370911, + "num_tokens": 274106374.0, + "step": 7517 + }, + { + "epoch": 1.396100278551532, + "grad_norm": 1.6446750164031982, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.873606264591217, + "num_tokens": 274140055.0, + "step": 7518 + }, + { + "epoch": 1.3962859795728877, + "grad_norm": 1.5452708005905151, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.884682297706604, + "num_tokens": 274172690.0, + "step": 7519 + }, + { + "epoch": 1.3964716805942432, + "grad_norm": 1.4151800870895386, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8800302743911743, + "num_tokens": 274212945.0, + "step": 7520 + }, + { + "epoch": 1.396657381615599, + "grad_norm": 1.463950514793396, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8764539957046509, + "num_tokens": 274250440.0, + "step": 7521 + }, + { + "epoch": 1.3968430826369544, + "grad_norm": 1.377193808555603, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8822736740112305, + "num_tokens": 274293939.0, + "step": 7522 + }, + { + "epoch": 1.3970287836583102, + "grad_norm": 1.4352285861968994, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8858067989349365, + "num_tokens": 274331526.0, + "step": 7523 + }, + { + "epoch": 1.3972144846796657, + "grad_norm": 1.3724771738052368, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8815351724624634, + "num_tokens": 274375862.0, + "step": 7524 + }, + { + "epoch": 1.3974001857010214, + "grad_norm": 1.462647795677185, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8876388072967529, + "num_tokens": 274413927.0, + "step": 7525 + }, + { + "epoch": 1.397585886722377, + "grad_norm": 1.4742728471755981, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8746618032455444, + "num_tokens": 274454194.0, + "step": 7526 + }, + { + "epoch": 1.3977715877437327, + "grad_norm": 1.5433320999145508, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8828964829444885, + "num_tokens": 274487484.0, + "step": 7527 + }, + { + "epoch": 1.3979572887650882, + "grad_norm": 1.4961305856704712, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8856547474861145, + "num_tokens": 274525727.0, + "step": 7528 + }, + { + "epoch": 1.398142989786444, + "grad_norm": 1.5416063070297241, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8738421201705933, + "num_tokens": 274562123.0, + "step": 7529 + }, + { + "epoch": 1.3983286908077994, + "grad_norm": 1.578457236289978, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8711115121841431, + "num_tokens": 274597982.0, + "step": 7530 + }, + { + "epoch": 1.398514391829155, + "grad_norm": 1.3978753089904785, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8910021781921387, + "num_tokens": 274638742.0, + "step": 7531 + }, + { + "epoch": 1.3987000928505107, + "grad_norm": 1.6602694988250732, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8848859071731567, + "num_tokens": 274674011.0, + "step": 7532 + }, + { + "epoch": 1.3988857938718664, + "grad_norm": 1.6951303482055664, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8794373273849487, + "num_tokens": 274705392.0, + "step": 7533 + }, + { + "epoch": 1.399071494893222, + "grad_norm": 1.542906403541565, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8766824007034302, + "num_tokens": 274741796.0, + "step": 7534 + }, + { + "epoch": 1.3992571959145774, + "grad_norm": 1.5027652978897095, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8746217489242554, + "num_tokens": 274779429.0, + "step": 7535 + }, + { + "epoch": 1.3994428969359332, + "grad_norm": 1.4850327968597412, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8709926009178162, + "num_tokens": 274817003.0, + "step": 7536 + }, + { + "epoch": 1.399628597957289, + "grad_norm": 1.6367411613464355, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8770177364349365, + "num_tokens": 274851412.0, + "step": 7537 + }, + { + "epoch": 1.3998142989786444, + "grad_norm": 1.5640467405319214, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8789399266242981, + "num_tokens": 274886379.0, + "step": 7538 + }, + { + "epoch": 1.4, + "grad_norm": 1.642794132232666, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8664563298225403, + "num_tokens": 274920050.0, + "step": 7539 + }, + { + "epoch": 1.4001857010213556, + "grad_norm": 1.4890992641448975, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8745169639587402, + "num_tokens": 274962508.0, + "step": 7540 + }, + { + "epoch": 1.4003714020427112, + "grad_norm": 1.5269277095794678, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8753933906555176, + "num_tokens": 275001778.0, + "step": 7541 + }, + { + "epoch": 1.4005571030640669, + "grad_norm": 1.6524823904037476, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8777617812156677, + "num_tokens": 275033392.0, + "step": 7542 + }, + { + "epoch": 1.4007428040854224, + "grad_norm": 1.6928460597991943, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.870672881603241, + "num_tokens": 275068073.0, + "step": 7543 + }, + { + "epoch": 1.4009285051067781, + "grad_norm": 1.456217646598816, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8891798257827759, + "num_tokens": 275105447.0, + "step": 7544 + }, + { + "epoch": 1.4011142061281336, + "grad_norm": 1.496242642402649, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8835276961326599, + "num_tokens": 275142027.0, + "step": 7545 + }, + { + "epoch": 1.4012999071494894, + "grad_norm": 1.656740427017212, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8821463584899902, + "num_tokens": 275175556.0, + "step": 7546 + }, + { + "epoch": 1.4014856081708449, + "grad_norm": 1.7866129875183105, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8654448986053467, + "num_tokens": 275207719.0, + "step": 7547 + }, + { + "epoch": 1.4016713091922006, + "grad_norm": 1.4558279514312744, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8933255076408386, + "num_tokens": 275244204.0, + "step": 7548 + }, + { + "epoch": 1.4018570102135561, + "grad_norm": 1.4917638301849365, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8796613812446594, + "num_tokens": 275279362.0, + "step": 7549 + }, + { + "epoch": 1.4020427112349119, + "grad_norm": 1.510313868522644, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8878883123397827, + "num_tokens": 275312648.0, + "step": 7550 + }, + { + "epoch": 1.4022284122562674, + "grad_norm": 1.486923336982727, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.886603832244873, + "num_tokens": 275348855.0, + "step": 7551 + }, + { + "epoch": 1.402414113277623, + "grad_norm": 1.5817253589630127, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8676247000694275, + "num_tokens": 275387985.0, + "step": 7552 + }, + { + "epoch": 1.4025998142989786, + "grad_norm": 1.5227283239364624, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8608407974243164, + "num_tokens": 275427735.0, + "step": 7553 + }, + { + "epoch": 1.4027855153203341, + "grad_norm": 1.5364352464675903, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8816983699798584, + "num_tokens": 275466259.0, + "step": 7554 + }, + { + "epoch": 1.4029712163416899, + "grad_norm": 1.5173274278640747, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8863667249679565, + "num_tokens": 275503583.0, + "step": 7555 + }, + { + "epoch": 1.4031569173630456, + "grad_norm": 1.579215168952942, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8756369352340698, + "num_tokens": 275536367.0, + "step": 7556 + }, + { + "epoch": 1.403342618384401, + "grad_norm": 1.4104909896850586, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8897096514701843, + "num_tokens": 275574916.0, + "step": 7557 + }, + { + "epoch": 1.4035283194057566, + "grad_norm": 1.7264026403427124, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8741804957389832, + "num_tokens": 275608799.0, + "step": 7558 + }, + { + "epoch": 1.4037140204271124, + "grad_norm": 1.6701018810272217, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8853421211242676, + "num_tokens": 275639579.0, + "step": 7559 + }, + { + "epoch": 1.403899721448468, + "grad_norm": 1.527479648590088, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8757290840148926, + "num_tokens": 275679039.0, + "step": 7560 + }, + { + "epoch": 1.4040854224698236, + "grad_norm": 1.583665132522583, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8667589426040649, + "num_tokens": 275720880.0, + "step": 7561 + }, + { + "epoch": 1.404271123491179, + "grad_norm": 1.5902940034866333, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8760779500007629, + "num_tokens": 275758821.0, + "step": 7562 + }, + { + "epoch": 1.4044568245125348, + "grad_norm": 1.5453524589538574, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8835939168930054, + "num_tokens": 275792744.0, + "step": 7563 + }, + { + "epoch": 1.4046425255338906, + "grad_norm": 1.4773151874542236, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.873744010925293, + "num_tokens": 275831494.0, + "step": 7564 + }, + { + "epoch": 1.404828226555246, + "grad_norm": 1.6295362710952759, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8794670701026917, + "num_tokens": 275863385.0, + "step": 7565 + }, + { + "epoch": 1.4050139275766016, + "grad_norm": 1.4628322124481201, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.88425213098526, + "num_tokens": 275905833.0, + "step": 7566 + }, + { + "epoch": 1.4051996285979573, + "grad_norm": 1.6756621599197388, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8566645383834839, + "num_tokens": 275942797.0, + "step": 7567 + }, + { + "epoch": 1.4053853296193128, + "grad_norm": 1.7044082880020142, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8778156638145447, + "num_tokens": 275972619.0, + "step": 7568 + }, + { + "epoch": 1.4055710306406686, + "grad_norm": 1.5414994955062866, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8883086442947388, + "num_tokens": 276007203.0, + "step": 7569 + }, + { + "epoch": 1.405756731662024, + "grad_norm": 1.7627716064453125, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8662482500076294, + "num_tokens": 276038305.0, + "step": 7570 + }, + { + "epoch": 1.4059424326833798, + "grad_norm": 1.4145234823226929, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8838778138160706, + "num_tokens": 276079417.0, + "step": 7571 + }, + { + "epoch": 1.4061281337047353, + "grad_norm": 1.6081502437591553, + "learning_rate": 1e-06, + "loss": 0.2849, + "mean_token_accuracy": 0.8985167145729065, + "num_tokens": 276107669.0, + "step": 7572 + }, + { + "epoch": 1.406313834726091, + "grad_norm": 1.6375495195388794, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.867448091506958, + "num_tokens": 276141273.0, + "step": 7573 + }, + { + "epoch": 1.4064995357474466, + "grad_norm": 1.6493412256240845, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8883172273635864, + "num_tokens": 276171905.0, + "step": 7574 + }, + { + "epoch": 1.4066852367688023, + "grad_norm": 1.4445796012878418, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.871216356754303, + "num_tokens": 276214856.0, + "step": 7575 + }, + { + "epoch": 1.4068709377901578, + "grad_norm": 1.590067982673645, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8688720464706421, + "num_tokens": 276250462.0, + "step": 7576 + }, + { + "epoch": 1.4070566388115133, + "grad_norm": 1.5651642084121704, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8772971034049988, + "num_tokens": 276285064.0, + "step": 7577 + }, + { + "epoch": 1.407242339832869, + "grad_norm": 1.6968268156051636, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8590732216835022, + "num_tokens": 276318014.0, + "step": 7578 + }, + { + "epoch": 1.4074280408542248, + "grad_norm": 1.677663803100586, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8702633380889893, + "num_tokens": 276351774.0, + "step": 7579 + }, + { + "epoch": 1.4076137418755803, + "grad_norm": 1.5880119800567627, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8775015473365784, + "num_tokens": 276388622.0, + "step": 7580 + }, + { + "epoch": 1.4077994428969358, + "grad_norm": 1.690307855606079, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8768857717514038, + "num_tokens": 276417437.0, + "step": 7581 + }, + { + "epoch": 1.4079851439182915, + "grad_norm": 1.519986629486084, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8800485134124756, + "num_tokens": 276453684.0, + "step": 7582 + }, + { + "epoch": 1.4081708449396473, + "grad_norm": 1.5220526456832886, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8670653104782104, + "num_tokens": 276491161.0, + "step": 7583 + }, + { + "epoch": 1.4083565459610028, + "grad_norm": 1.648641586303711, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8785790205001831, + "num_tokens": 276526370.0, + "step": 7584 + }, + { + "epoch": 1.4085422469823583, + "grad_norm": 1.4490907192230225, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8867394924163818, + "num_tokens": 276563326.0, + "step": 7585 + }, + { + "epoch": 1.408727948003714, + "grad_norm": 1.3770999908447266, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.874252438545227, + "num_tokens": 276608123.0, + "step": 7586 + }, + { + "epoch": 1.4089136490250698, + "grad_norm": 1.5386651754379272, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8728407621383667, + "num_tokens": 276648617.0, + "step": 7587 + }, + { + "epoch": 1.4090993500464253, + "grad_norm": 1.5760719776153564, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8631871938705444, + "num_tokens": 276689013.0, + "step": 7588 + }, + { + "epoch": 1.4092850510677808, + "grad_norm": 1.4229247570037842, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8717937469482422, + "num_tokens": 276731025.0, + "step": 7589 + }, + { + "epoch": 1.4094707520891365, + "grad_norm": 1.4016227722167969, + "learning_rate": 1e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.8967006206512451, + "num_tokens": 276770727.0, + "step": 7590 + }, + { + "epoch": 1.409656453110492, + "grad_norm": 1.6592484712600708, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8715678453445435, + "num_tokens": 276806927.0, + "step": 7591 + }, + { + "epoch": 1.4098421541318478, + "grad_norm": 1.6049973964691162, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8630472421646118, + "num_tokens": 276841802.0, + "step": 7592 + }, + { + "epoch": 1.4100278551532033, + "grad_norm": 1.4742122888565063, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8692257404327393, + "num_tokens": 276882914.0, + "step": 7593 + }, + { + "epoch": 1.410213556174559, + "grad_norm": 1.5663524866104126, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8793806433677673, + "num_tokens": 276920315.0, + "step": 7594 + }, + { + "epoch": 1.4103992571959145, + "grad_norm": 1.6370137929916382, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8809047937393188, + "num_tokens": 276954541.0, + "step": 7595 + }, + { + "epoch": 1.4105849582172703, + "grad_norm": 1.6753811836242676, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8615546226501465, + "num_tokens": 276992305.0, + "step": 7596 + }, + { + "epoch": 1.4107706592386258, + "grad_norm": 1.6411356925964355, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.876731812953949, + "num_tokens": 277024407.0, + "step": 7597 + }, + { + "epoch": 1.4109563602599815, + "grad_norm": 1.5838556289672852, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8734356164932251, + "num_tokens": 277061076.0, + "step": 7598 + }, + { + "epoch": 1.411142061281337, + "grad_norm": 1.8092199563980103, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8593953251838684, + "num_tokens": 277090526.0, + "step": 7599 + }, + { + "epoch": 1.4113277623026927, + "grad_norm": 1.620871663093567, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.875701367855072, + "num_tokens": 277128810.0, + "step": 7600 + }, + { + "epoch": 1.4115134633240483, + "grad_norm": 1.5142468214035034, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8850531578063965, + "num_tokens": 277165728.0, + "step": 7601 + }, + { + "epoch": 1.411699164345404, + "grad_norm": 1.7180815935134888, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8644647598266602, + "num_tokens": 277200270.0, + "step": 7602 + }, + { + "epoch": 1.4118848653667595, + "grad_norm": 1.4657124280929565, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8815652132034302, + "num_tokens": 277239560.0, + "step": 7603 + }, + { + "epoch": 1.412070566388115, + "grad_norm": 1.5693175792694092, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8691568374633789, + "num_tokens": 277279450.0, + "step": 7604 + }, + { + "epoch": 1.4122562674094707, + "grad_norm": 1.7491910457611084, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8737360239028931, + "num_tokens": 277312584.0, + "step": 7605 + }, + { + "epoch": 1.4124419684308265, + "grad_norm": 1.651131272315979, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8727519512176514, + "num_tokens": 277346536.0, + "step": 7606 + }, + { + "epoch": 1.412627669452182, + "grad_norm": 1.598456859588623, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8587016463279724, + "num_tokens": 277384115.0, + "step": 7607 + }, + { + "epoch": 1.4128133704735375, + "grad_norm": 1.693352460861206, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8791429400444031, + "num_tokens": 277416068.0, + "step": 7608 + }, + { + "epoch": 1.4129990714948932, + "grad_norm": 1.7041994333267212, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8773518204689026, + "num_tokens": 277447939.0, + "step": 7609 + }, + { + "epoch": 1.413184772516249, + "grad_norm": 1.432332992553711, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8652840852737427, + "num_tokens": 277490923.0, + "step": 7610 + }, + { + "epoch": 1.4133704735376045, + "grad_norm": 1.6743199825286865, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8784985542297363, + "num_tokens": 277525548.0, + "step": 7611 + }, + { + "epoch": 1.41355617455896, + "grad_norm": 1.6349493265151978, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.875914454460144, + "num_tokens": 277558661.0, + "step": 7612 + }, + { + "epoch": 1.4137418755803157, + "grad_norm": 1.6075103282928467, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8782437443733215, + "num_tokens": 277593102.0, + "step": 7613 + }, + { + "epoch": 1.4139275766016712, + "grad_norm": 1.5680334568023682, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8755055665969849, + "num_tokens": 277627775.0, + "step": 7614 + }, + { + "epoch": 1.414113277623027, + "grad_norm": 1.4732229709625244, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8677695989608765, + "num_tokens": 277670738.0, + "step": 7615 + }, + { + "epoch": 1.4142989786443825, + "grad_norm": 1.5716089010238647, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8782804012298584, + "num_tokens": 277704285.0, + "step": 7616 + }, + { + "epoch": 1.4144846796657382, + "grad_norm": 1.5553499460220337, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8756623268127441, + "num_tokens": 277740003.0, + "step": 7617 + }, + { + "epoch": 1.4146703806870937, + "grad_norm": 1.5151190757751465, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8861273527145386, + "num_tokens": 277776626.0, + "step": 7618 + }, + { + "epoch": 1.4148560817084495, + "grad_norm": 1.414093255996704, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8896116614341736, + "num_tokens": 277815984.0, + "step": 7619 + }, + { + "epoch": 1.415041782729805, + "grad_norm": 1.4200208187103271, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8911826610565186, + "num_tokens": 277856945.0, + "step": 7620 + }, + { + "epoch": 1.4152274837511607, + "grad_norm": 1.5555450916290283, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8707351684570312, + "num_tokens": 277892534.0, + "step": 7621 + }, + { + "epoch": 1.4154131847725162, + "grad_norm": 1.479189395904541, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8679303526878357, + "num_tokens": 277936531.0, + "step": 7622 + }, + { + "epoch": 1.415598885793872, + "grad_norm": 1.5068800449371338, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8826500177383423, + "num_tokens": 277972967.0, + "step": 7623 + }, + { + "epoch": 1.4157845868152275, + "grad_norm": 1.5080360174179077, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8854680061340332, + "num_tokens": 278009619.0, + "step": 7624 + }, + { + "epoch": 1.4159702878365832, + "grad_norm": 1.4830442667007446, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.874306321144104, + "num_tokens": 278047803.0, + "step": 7625 + }, + { + "epoch": 1.4161559888579387, + "grad_norm": 1.5885566473007202, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8639586567878723, + "num_tokens": 278087488.0, + "step": 7626 + }, + { + "epoch": 1.4163416898792942, + "grad_norm": 1.4492299556732178, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8791127800941467, + "num_tokens": 278128735.0, + "step": 7627 + }, + { + "epoch": 1.41652739090065, + "grad_norm": 1.598420262336731, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8733502626419067, + "num_tokens": 278166714.0, + "step": 7628 + }, + { + "epoch": 1.4167130919220057, + "grad_norm": 1.5528504848480225, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.876380205154419, + "num_tokens": 278203961.0, + "step": 7629 + }, + { + "epoch": 1.4168987929433612, + "grad_norm": 1.5970715284347534, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8766830563545227, + "num_tokens": 278238244.0, + "step": 7630 + }, + { + "epoch": 1.4170844939647167, + "grad_norm": 1.4620615243911743, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8782263994216919, + "num_tokens": 278279044.0, + "step": 7631 + }, + { + "epoch": 1.4172701949860724, + "grad_norm": 1.4340215921401978, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8782171010971069, + "num_tokens": 278317273.0, + "step": 7632 + }, + { + "epoch": 1.4174558960074282, + "grad_norm": 1.4795715808868408, + "learning_rate": 1e-06, + "loss": 0.2634, + "mean_token_accuracy": 0.904594898223877, + "num_tokens": 278352989.0, + "step": 7633 + }, + { + "epoch": 1.4176415970287837, + "grad_norm": 1.59945547580719, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8718956708908081, + "num_tokens": 278388301.0, + "step": 7634 + }, + { + "epoch": 1.4178272980501392, + "grad_norm": 1.4312869310379028, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8830242156982422, + "num_tokens": 278429333.0, + "step": 7635 + }, + { + "epoch": 1.418012999071495, + "grad_norm": 1.5742793083190918, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8574792146682739, + "num_tokens": 278467611.0, + "step": 7636 + }, + { + "epoch": 1.4181987000928504, + "grad_norm": 1.473851203918457, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8812637329101562, + "num_tokens": 278504121.0, + "step": 7637 + }, + { + "epoch": 1.4183844011142062, + "grad_norm": 1.5236458778381348, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8810372352600098, + "num_tokens": 278537454.0, + "step": 7638 + }, + { + "epoch": 1.4185701021355617, + "grad_norm": 1.4561781883239746, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.870499849319458, + "num_tokens": 278578183.0, + "step": 7639 + }, + { + "epoch": 1.4187558031569174, + "grad_norm": 1.6424109935760498, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8784393072128296, + "num_tokens": 278610645.0, + "step": 7640 + }, + { + "epoch": 1.418941504178273, + "grad_norm": 1.7120306491851807, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8540816307067871, + "num_tokens": 278648290.0, + "step": 7641 + }, + { + "epoch": 1.4191272051996286, + "grad_norm": 1.5176634788513184, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.867734432220459, + "num_tokens": 278690439.0, + "step": 7642 + }, + { + "epoch": 1.4193129062209842, + "grad_norm": 1.749862790107727, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8558420538902283, + "num_tokens": 278727631.0, + "step": 7643 + }, + { + "epoch": 1.41949860724234, + "grad_norm": 1.52665376663208, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8822206258773804, + "num_tokens": 278765501.0, + "step": 7644 + }, + { + "epoch": 1.4196843082636954, + "grad_norm": 1.4767191410064697, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8707148432731628, + "num_tokens": 278803970.0, + "step": 7645 + }, + { + "epoch": 1.4198700092850511, + "grad_norm": 1.5631734132766724, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8634592294692993, + "num_tokens": 278841000.0, + "step": 7646 + }, + { + "epoch": 1.4200557103064066, + "grad_norm": 1.6171265840530396, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8618517518043518, + "num_tokens": 278877924.0, + "step": 7647 + }, + { + "epoch": 1.4202414113277624, + "grad_norm": 1.5080338716506958, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8725497722625732, + "num_tokens": 278916152.0, + "step": 7648 + }, + { + "epoch": 1.420427112349118, + "grad_norm": 1.6248611211776733, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8778688907623291, + "num_tokens": 278947692.0, + "step": 7649 + }, + { + "epoch": 1.4206128133704734, + "grad_norm": 1.7925355434417725, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8474425077438354, + "num_tokens": 278982813.0, + "step": 7650 + }, + { + "epoch": 1.4207985143918291, + "grad_norm": 1.7643378973007202, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8802647590637207, + "num_tokens": 279016862.0, + "step": 7651 + }, + { + "epoch": 1.4209842154131849, + "grad_norm": 1.5692538022994995, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.873353123664856, + "num_tokens": 279051788.0, + "step": 7652 + }, + { + "epoch": 1.4211699164345404, + "grad_norm": 1.6198201179504395, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8759628534317017, + "num_tokens": 279089042.0, + "step": 7653 + }, + { + "epoch": 1.421355617455896, + "grad_norm": 1.6060848236083984, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8783082365989685, + "num_tokens": 279121656.0, + "step": 7654 + }, + { + "epoch": 1.4215413184772516, + "grad_norm": 1.546629786491394, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8762638568878174, + "num_tokens": 279158163.0, + "step": 7655 + }, + { + "epoch": 1.4217270194986074, + "grad_norm": 1.4988881349563599, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.879341185092926, + "num_tokens": 279192137.0, + "step": 7656 + }, + { + "epoch": 1.4219127205199629, + "grad_norm": 1.4995142221450806, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8733858466148376, + "num_tokens": 279228814.0, + "step": 7657 + }, + { + "epoch": 1.4220984215413184, + "grad_norm": 1.545868992805481, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8768255710601807, + "num_tokens": 279268302.0, + "step": 7658 + }, + { + "epoch": 1.4222841225626741, + "grad_norm": 1.6091548204421997, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8938448429107666, + "num_tokens": 279299435.0, + "step": 7659 + }, + { + "epoch": 1.4224698235840298, + "grad_norm": 1.6654115915298462, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8681484460830688, + "num_tokens": 279333675.0, + "step": 7660 + }, + { + "epoch": 1.4226555246053854, + "grad_norm": 1.5857404470443726, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8620010614395142, + "num_tokens": 279371097.0, + "step": 7661 + }, + { + "epoch": 1.4228412256267409, + "grad_norm": 1.4054020643234253, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8792970776557922, + "num_tokens": 279413546.0, + "step": 7662 + }, + { + "epoch": 1.4230269266480966, + "grad_norm": 1.6217589378356934, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8804105520248413, + "num_tokens": 279446483.0, + "step": 7663 + }, + { + "epoch": 1.4232126276694521, + "grad_norm": 1.5427547693252563, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8810541033744812, + "num_tokens": 279484897.0, + "step": 7664 + }, + { + "epoch": 1.4233983286908078, + "grad_norm": 1.5745803117752075, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8623389005661011, + "num_tokens": 279521834.0, + "step": 7665 + }, + { + "epoch": 1.4235840297121634, + "grad_norm": 1.4946173429489136, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8937286138534546, + "num_tokens": 279557257.0, + "step": 7666 + }, + { + "epoch": 1.423769730733519, + "grad_norm": 1.557908058166504, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8699053525924683, + "num_tokens": 279594038.0, + "step": 7667 + }, + { + "epoch": 1.4239554317548746, + "grad_norm": 1.5890984535217285, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8809967041015625, + "num_tokens": 279635322.0, + "step": 7668 + }, + { + "epoch": 1.4241411327762303, + "grad_norm": 1.6015691757202148, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8633972406387329, + "num_tokens": 279672485.0, + "step": 7669 + }, + { + "epoch": 1.4243268337975858, + "grad_norm": 1.5882471799850464, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8759536743164062, + "num_tokens": 279709714.0, + "step": 7670 + }, + { + "epoch": 1.4245125348189416, + "grad_norm": 1.571563959121704, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8864655494689941, + "num_tokens": 279742221.0, + "step": 7671 + }, + { + "epoch": 1.424698235840297, + "grad_norm": 1.4445409774780273, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8762139081954956, + "num_tokens": 279782923.0, + "step": 7672 + }, + { + "epoch": 1.4248839368616528, + "grad_norm": 1.5534683465957642, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.8976097106933594, + "num_tokens": 279811251.0, + "step": 7673 + }, + { + "epoch": 1.4250696378830083, + "grad_norm": 1.6494189500808716, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8756246566772461, + "num_tokens": 279847291.0, + "step": 7674 + }, + { + "epoch": 1.425255338904364, + "grad_norm": 1.6364988088607788, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.876143753528595, + "num_tokens": 279882606.0, + "step": 7675 + }, + { + "epoch": 1.4254410399257196, + "grad_norm": 1.7344176769256592, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8898227214813232, + "num_tokens": 279917587.0, + "step": 7676 + }, + { + "epoch": 1.425626740947075, + "grad_norm": 1.6470879316329956, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8715806007385254, + "num_tokens": 279957249.0, + "step": 7677 + }, + { + "epoch": 1.4258124419684308, + "grad_norm": 1.569414496421814, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8921108841896057, + "num_tokens": 279992512.0, + "step": 7678 + }, + { + "epoch": 1.4259981429897866, + "grad_norm": 1.5778179168701172, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8829284906387329, + "num_tokens": 280027305.0, + "step": 7679 + }, + { + "epoch": 1.426183844011142, + "grad_norm": 1.623583197593689, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.878485918045044, + "num_tokens": 280061278.0, + "step": 7680 + }, + { + "epoch": 1.4263695450324976, + "grad_norm": 1.6678717136383057, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8852811455726624, + "num_tokens": 280092643.0, + "step": 7681 + }, + { + "epoch": 1.4265552460538533, + "grad_norm": 1.4416027069091797, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8926724195480347, + "num_tokens": 280130746.0, + "step": 7682 + }, + { + "epoch": 1.426740947075209, + "grad_norm": 1.5014516115188599, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8783575892448425, + "num_tokens": 280172443.0, + "step": 7683 + }, + { + "epoch": 1.4269266480965646, + "grad_norm": 1.5579206943511963, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8798258900642395, + "num_tokens": 280208309.0, + "step": 7684 + }, + { + "epoch": 1.42711234911792, + "grad_norm": 1.458968162536621, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8845924139022827, + "num_tokens": 280245691.0, + "step": 7685 + }, + { + "epoch": 1.4272980501392758, + "grad_norm": 1.652551531791687, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8623096346855164, + "num_tokens": 280283500.0, + "step": 7686 + }, + { + "epoch": 1.4274837511606313, + "grad_norm": 1.799874186515808, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8761066198348999, + "num_tokens": 280310832.0, + "step": 7687 + }, + { + "epoch": 1.427669452181987, + "grad_norm": 1.5541094541549683, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8957619667053223, + "num_tokens": 280344558.0, + "step": 7688 + }, + { + "epoch": 1.4278551532033426, + "grad_norm": 1.726014494895935, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8661722540855408, + "num_tokens": 280379536.0, + "step": 7689 + }, + { + "epoch": 1.4280408542246983, + "grad_norm": 1.510232925415039, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8761122226715088, + "num_tokens": 280416187.0, + "step": 7690 + }, + { + "epoch": 1.4282265552460538, + "grad_norm": 1.6027849912643433, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8892438411712646, + "num_tokens": 280452145.0, + "step": 7691 + }, + { + "epoch": 1.4284122562674095, + "grad_norm": 1.5563747882843018, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8745352029800415, + "num_tokens": 280490098.0, + "step": 7692 + }, + { + "epoch": 1.428597957288765, + "grad_norm": 1.5446046590805054, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8820533156394958, + "num_tokens": 280525638.0, + "step": 7693 + }, + { + "epoch": 1.4287836583101208, + "grad_norm": 1.5938472747802734, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8816181421279907, + "num_tokens": 280561307.0, + "step": 7694 + }, + { + "epoch": 1.4289693593314763, + "grad_norm": 1.6030068397521973, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8628394603729248, + "num_tokens": 280598282.0, + "step": 7695 + }, + { + "epoch": 1.429155060352832, + "grad_norm": 1.65445077419281, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8846588134765625, + "num_tokens": 280629175.0, + "step": 7696 + }, + { + "epoch": 1.4293407613741875, + "grad_norm": 1.5512558221817017, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8731402158737183, + "num_tokens": 280664516.0, + "step": 7697 + }, + { + "epoch": 1.4295264623955433, + "grad_norm": 1.6608428955078125, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8504440784454346, + "num_tokens": 280699585.0, + "step": 7698 + }, + { + "epoch": 1.4297121634168988, + "grad_norm": 2.0689759254455566, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8749589920043945, + "num_tokens": 280733136.0, + "step": 7699 + }, + { + "epoch": 1.4298978644382543, + "grad_norm": 1.5721969604492188, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8718513250350952, + "num_tokens": 280767636.0, + "step": 7700 + }, + { + "epoch": 1.43008356545961, + "grad_norm": 1.6307512521743774, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8566396236419678, + "num_tokens": 280805169.0, + "step": 7701 + }, + { + "epoch": 1.4302692664809658, + "grad_norm": 1.6277066469192505, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8686787486076355, + "num_tokens": 280839936.0, + "step": 7702 + }, + { + "epoch": 1.4304549675023213, + "grad_norm": 1.6239471435546875, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8827618360519409, + "num_tokens": 280875422.0, + "step": 7703 + }, + { + "epoch": 1.4306406685236768, + "grad_norm": 1.6067878007888794, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.87516188621521, + "num_tokens": 280912600.0, + "step": 7704 + }, + { + "epoch": 1.4308263695450325, + "grad_norm": 1.6308772563934326, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8718323707580566, + "num_tokens": 280946053.0, + "step": 7705 + }, + { + "epoch": 1.4310120705663882, + "grad_norm": 1.4782798290252686, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8783652186393738, + "num_tokens": 280981882.0, + "step": 7706 + }, + { + "epoch": 1.4311977715877438, + "grad_norm": 1.708787202835083, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8906667232513428, + "num_tokens": 281014766.0, + "step": 7707 + }, + { + "epoch": 1.4313834726090993, + "grad_norm": 1.6837518215179443, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8635848760604858, + "num_tokens": 281047838.0, + "step": 7708 + }, + { + "epoch": 1.431569173630455, + "grad_norm": 1.5622165203094482, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.881052553653717, + "num_tokens": 281085073.0, + "step": 7709 + }, + { + "epoch": 1.4317548746518105, + "grad_norm": 1.689882755279541, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8745399117469788, + "num_tokens": 281115613.0, + "step": 7710 + }, + { + "epoch": 1.4319405756731662, + "grad_norm": 1.4476656913757324, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8802100419998169, + "num_tokens": 281154208.0, + "step": 7711 + }, + { + "epoch": 1.4321262766945217, + "grad_norm": 1.3850675821304321, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8910446166992188, + "num_tokens": 281195170.0, + "step": 7712 + }, + { + "epoch": 1.4323119777158775, + "grad_norm": 1.4785138368606567, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.881303608417511, + "num_tokens": 281234298.0, + "step": 7713 + }, + { + "epoch": 1.432497678737233, + "grad_norm": 1.494370698928833, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8955663442611694, + "num_tokens": 281268510.0, + "step": 7714 + }, + { + "epoch": 1.4326833797585887, + "grad_norm": 1.6285393238067627, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8657593727111816, + "num_tokens": 281307301.0, + "step": 7715 + }, + { + "epoch": 1.4328690807799442, + "grad_norm": 1.5972293615341187, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8647386431694031, + "num_tokens": 281344802.0, + "step": 7716 + }, + { + "epoch": 1.4330547818013, + "grad_norm": 1.5742559432983398, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8673133850097656, + "num_tokens": 281384218.0, + "step": 7717 + }, + { + "epoch": 1.4332404828226555, + "grad_norm": 1.594428300857544, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8886518478393555, + "num_tokens": 281419228.0, + "step": 7718 + }, + { + "epoch": 1.4334261838440112, + "grad_norm": 1.5672645568847656, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8647541999816895, + "num_tokens": 281458602.0, + "step": 7719 + }, + { + "epoch": 1.4336118848653667, + "grad_norm": 1.5487151145935059, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8839205503463745, + "num_tokens": 281493423.0, + "step": 7720 + }, + { + "epoch": 1.4337975858867225, + "grad_norm": 1.5389314889907837, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8778655529022217, + "num_tokens": 281529896.0, + "step": 7721 + }, + { + "epoch": 1.433983286908078, + "grad_norm": 1.851313591003418, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8814769387245178, + "num_tokens": 281556185.0, + "step": 7722 + }, + { + "epoch": 1.4341689879294335, + "grad_norm": 1.5058146715164185, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8871099352836609, + "num_tokens": 281596948.0, + "step": 7723 + }, + { + "epoch": 1.4343546889507892, + "grad_norm": 1.821001410484314, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8784581422805786, + "num_tokens": 281629723.0, + "step": 7724 + }, + { + "epoch": 1.434540389972145, + "grad_norm": 1.5868422985076904, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8677501678466797, + "num_tokens": 281667340.0, + "step": 7725 + }, + { + "epoch": 1.4347260909935005, + "grad_norm": 1.59817373752594, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8781644105911255, + "num_tokens": 281701781.0, + "step": 7726 + }, + { + "epoch": 1.434911792014856, + "grad_norm": 1.5444992780685425, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8778688907623291, + "num_tokens": 281739125.0, + "step": 7727 + }, + { + "epoch": 1.4350974930362117, + "grad_norm": 1.6196401119232178, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8787547945976257, + "num_tokens": 281773693.0, + "step": 7728 + }, + { + "epoch": 1.4352831940575674, + "grad_norm": 1.507140040397644, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8860678672790527, + "num_tokens": 281809441.0, + "step": 7729 + }, + { + "epoch": 1.435468895078923, + "grad_norm": 1.6608504056930542, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8744805455207825, + "num_tokens": 281843207.0, + "step": 7730 + }, + { + "epoch": 1.4356545961002785, + "grad_norm": 1.6058664321899414, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8758888244628906, + "num_tokens": 281877533.0, + "step": 7731 + }, + { + "epoch": 1.4358402971216342, + "grad_norm": 1.6105246543884277, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8772973418235779, + "num_tokens": 281910372.0, + "step": 7732 + }, + { + "epoch": 1.43602599814299, + "grad_norm": 1.4982246160507202, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8693909645080566, + "num_tokens": 281954312.0, + "step": 7733 + }, + { + "epoch": 1.4362116991643454, + "grad_norm": 1.6453227996826172, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8795789480209351, + "num_tokens": 281985886.0, + "step": 7734 + }, + { + "epoch": 1.436397400185701, + "grad_norm": 1.4942446947097778, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8742650747299194, + "num_tokens": 282021937.0, + "step": 7735 + }, + { + "epoch": 1.4365831012070567, + "grad_norm": 1.5485711097717285, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8852427005767822, + "num_tokens": 282055252.0, + "step": 7736 + }, + { + "epoch": 1.4367688022284122, + "grad_norm": 1.5633827447891235, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8764970898628235, + "num_tokens": 282092243.0, + "step": 7737 + }, + { + "epoch": 1.436954503249768, + "grad_norm": 1.4849843978881836, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8746477961540222, + "num_tokens": 282130634.0, + "step": 7738 + }, + { + "epoch": 1.4371402042711234, + "grad_norm": 1.5806294679641724, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8738194704055786, + "num_tokens": 282167941.0, + "step": 7739 + }, + { + "epoch": 1.4373259052924792, + "grad_norm": 1.4213706254959106, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8766252994537354, + "num_tokens": 282210638.0, + "step": 7740 + }, + { + "epoch": 1.4375116063138347, + "grad_norm": 1.544192910194397, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8594900965690613, + "num_tokens": 282252827.0, + "step": 7741 + }, + { + "epoch": 1.4376973073351904, + "grad_norm": 1.5450125932693481, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8654932975769043, + "num_tokens": 282294377.0, + "step": 7742 + }, + { + "epoch": 1.437883008356546, + "grad_norm": 1.5836408138275146, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8611900806427002, + "num_tokens": 282332908.0, + "step": 7743 + }, + { + "epoch": 1.4380687093779017, + "grad_norm": 1.582837700843811, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8704190254211426, + "num_tokens": 282370714.0, + "step": 7744 + }, + { + "epoch": 1.4382544103992572, + "grad_norm": 1.567902684211731, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8754538893699646, + "num_tokens": 282408788.0, + "step": 7745 + }, + { + "epoch": 1.4384401114206127, + "grad_norm": 1.6167523860931396, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8847585916519165, + "num_tokens": 282444326.0, + "step": 7746 + }, + { + "epoch": 1.4386258124419684, + "grad_norm": 1.5971657037734985, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8615373373031616, + "num_tokens": 282479195.0, + "step": 7747 + }, + { + "epoch": 1.4388115134633241, + "grad_norm": 1.5196031332015991, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8802011609077454, + "num_tokens": 282515634.0, + "step": 7748 + }, + { + "epoch": 1.4389972144846797, + "grad_norm": 1.603676438331604, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8798580765724182, + "num_tokens": 282550650.0, + "step": 7749 + }, + { + "epoch": 1.4391829155060352, + "grad_norm": 1.4997293949127197, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8848920464515686, + "num_tokens": 282585520.0, + "step": 7750 + }, + { + "epoch": 1.439368616527391, + "grad_norm": 1.5387651920318604, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8890928030014038, + "num_tokens": 282621691.0, + "step": 7751 + }, + { + "epoch": 1.4395543175487466, + "grad_norm": 1.5775209665298462, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.884474515914917, + "num_tokens": 282656485.0, + "step": 7752 + }, + { + "epoch": 1.4397400185701021, + "grad_norm": 1.546950101852417, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8888407945632935, + "num_tokens": 282694070.0, + "step": 7753 + }, + { + "epoch": 1.4399257195914577, + "grad_norm": 1.7088395357131958, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8598498106002808, + "num_tokens": 282729591.0, + "step": 7754 + }, + { + "epoch": 1.4401114206128134, + "grad_norm": 1.5314937829971313, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8819003105163574, + "num_tokens": 282765283.0, + "step": 7755 + }, + { + "epoch": 1.4402971216341691, + "grad_norm": 1.56644868850708, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8707703351974487, + "num_tokens": 282802415.0, + "step": 7756 + }, + { + "epoch": 1.4404828226555246, + "grad_norm": 1.4472603797912598, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8734831213951111, + "num_tokens": 282841270.0, + "step": 7757 + }, + { + "epoch": 1.4406685236768801, + "grad_norm": 1.7350895404815674, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8847718834877014, + "num_tokens": 282875480.0, + "step": 7758 + }, + { + "epoch": 1.4408542246982359, + "grad_norm": 1.4445745944976807, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8854227066040039, + "num_tokens": 282915367.0, + "step": 7759 + }, + { + "epoch": 1.4410399257195914, + "grad_norm": 1.4985103607177734, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8802337646484375, + "num_tokens": 282951592.0, + "step": 7760 + }, + { + "epoch": 1.4412256267409471, + "grad_norm": 1.379207968711853, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.889563798904419, + "num_tokens": 282991639.0, + "step": 7761 + }, + { + "epoch": 1.4414113277623026, + "grad_norm": 1.5857651233673096, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8756153583526611, + "num_tokens": 283029120.0, + "step": 7762 + }, + { + "epoch": 1.4415970287836584, + "grad_norm": 1.7097816467285156, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8611449003219604, + "num_tokens": 283063372.0, + "step": 7763 + }, + { + "epoch": 1.4417827298050139, + "grad_norm": 1.622064471244812, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8863388299942017, + "num_tokens": 283099057.0, + "step": 7764 + }, + { + "epoch": 1.4419684308263696, + "grad_norm": 1.6665050983428955, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8715049624443054, + "num_tokens": 283132243.0, + "step": 7765 + }, + { + "epoch": 1.4421541318477251, + "grad_norm": 1.5949747562408447, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8739005327224731, + "num_tokens": 283167887.0, + "step": 7766 + }, + { + "epoch": 1.4423398328690809, + "grad_norm": 1.747422695159912, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8691592812538147, + "num_tokens": 283203025.0, + "step": 7767 + }, + { + "epoch": 1.4425255338904364, + "grad_norm": 1.6937322616577148, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8631728291511536, + "num_tokens": 283239801.0, + "step": 7768 + }, + { + "epoch": 1.442711234911792, + "grad_norm": 1.6484060287475586, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8783978223800659, + "num_tokens": 283271907.0, + "step": 7769 + }, + { + "epoch": 1.4428969359331476, + "grad_norm": 1.6312421560287476, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8801387548446655, + "num_tokens": 283304817.0, + "step": 7770 + }, + { + "epoch": 1.4430826369545033, + "grad_norm": 1.5355862379074097, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8710265755653381, + "num_tokens": 283340753.0, + "step": 7771 + }, + { + "epoch": 1.4432683379758589, + "grad_norm": 1.4932318925857544, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8859795331954956, + "num_tokens": 283379112.0, + "step": 7772 + }, + { + "epoch": 1.4434540389972144, + "grad_norm": 1.5778181552886963, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.887323260307312, + "num_tokens": 283415896.0, + "step": 7773 + }, + { + "epoch": 1.44363974001857, + "grad_norm": 1.6757595539093018, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8803730607032776, + "num_tokens": 283455262.0, + "step": 7774 + }, + { + "epoch": 1.4438254410399258, + "grad_norm": 1.6625655889511108, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8744921684265137, + "num_tokens": 283489848.0, + "step": 7775 + }, + { + "epoch": 1.4440111420612813, + "grad_norm": 1.4388571977615356, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8832231163978577, + "num_tokens": 283528769.0, + "step": 7776 + }, + { + "epoch": 1.4441968430826368, + "grad_norm": 1.62434720993042, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8936272859573364, + "num_tokens": 283559444.0, + "step": 7777 + }, + { + "epoch": 1.4443825441039926, + "grad_norm": 1.6165450811386108, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8686401844024658, + "num_tokens": 283594624.0, + "step": 7778 + }, + { + "epoch": 1.4445682451253483, + "grad_norm": 1.5299417972564697, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8961743116378784, + "num_tokens": 283627267.0, + "step": 7779 + }, + { + "epoch": 1.4447539461467038, + "grad_norm": 1.596958875656128, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8784025311470032, + "num_tokens": 283663785.0, + "step": 7780 + }, + { + "epoch": 1.4449396471680593, + "grad_norm": 1.6251782178878784, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8819115161895752, + "num_tokens": 283700492.0, + "step": 7781 + }, + { + "epoch": 1.445125348189415, + "grad_norm": 1.4630322456359863, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8749032616615295, + "num_tokens": 283738094.0, + "step": 7782 + }, + { + "epoch": 1.4453110492107706, + "grad_norm": 1.5545272827148438, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8681191205978394, + "num_tokens": 283777863.0, + "step": 7783 + }, + { + "epoch": 1.4454967502321263, + "grad_norm": 1.5576483011245728, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8734534978866577, + "num_tokens": 283812596.0, + "step": 7784 + }, + { + "epoch": 1.4456824512534818, + "grad_norm": 1.559791088104248, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8713942766189575, + "num_tokens": 283848111.0, + "step": 7785 + }, + { + "epoch": 1.4458681522748376, + "grad_norm": 1.6318432092666626, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.8930999040603638, + "num_tokens": 283880181.0, + "step": 7786 + }, + { + "epoch": 1.446053853296193, + "grad_norm": 1.5586953163146973, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8738675713539124, + "num_tokens": 283915332.0, + "step": 7787 + }, + { + "epoch": 1.4462395543175488, + "grad_norm": 1.7008346319198608, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8808486461639404, + "num_tokens": 283943030.0, + "step": 7788 + }, + { + "epoch": 1.4464252553389043, + "grad_norm": 1.4999356269836426, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8711831569671631, + "num_tokens": 283981415.0, + "step": 7789 + }, + { + "epoch": 1.44661095636026, + "grad_norm": 1.5766417980194092, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.875917375087738, + "num_tokens": 284015881.0, + "step": 7790 + }, + { + "epoch": 1.4467966573816156, + "grad_norm": 1.6466532945632935, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.871961236000061, + "num_tokens": 284053112.0, + "step": 7791 + }, + { + "epoch": 1.4469823584029713, + "grad_norm": 1.6856911182403564, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8658440113067627, + "num_tokens": 284085740.0, + "step": 7792 + }, + { + "epoch": 1.4471680594243268, + "grad_norm": 1.5331449508666992, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8598350882530212, + "num_tokens": 284125914.0, + "step": 7793 + }, + { + "epoch": 1.4473537604456825, + "grad_norm": 1.5507445335388184, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8681268692016602, + "num_tokens": 284164119.0, + "step": 7794 + }, + { + "epoch": 1.447539461467038, + "grad_norm": 1.3645200729370117, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8892822265625, + "num_tokens": 284205605.0, + "step": 7795 + }, + { + "epoch": 1.4477251624883936, + "grad_norm": 1.5597258806228638, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.865813672542572, + "num_tokens": 284240897.0, + "step": 7796 + }, + { + "epoch": 1.4479108635097493, + "grad_norm": 1.4842978715896606, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.870857298374176, + "num_tokens": 284280128.0, + "step": 7797 + }, + { + "epoch": 1.448096564531105, + "grad_norm": 1.4645799398422241, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8819600343704224, + "num_tokens": 284315961.0, + "step": 7798 + }, + { + "epoch": 1.4482822655524605, + "grad_norm": 1.5973173379898071, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8648574352264404, + "num_tokens": 284353818.0, + "step": 7799 + }, + { + "epoch": 1.448467966573816, + "grad_norm": 1.5017720460891724, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8804402351379395, + "num_tokens": 284390113.0, + "step": 7800 + }, + { + "epoch": 1.4486536675951718, + "grad_norm": 1.6051214933395386, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8777121305465698, + "num_tokens": 284423783.0, + "step": 7801 + }, + { + "epoch": 1.4488393686165275, + "grad_norm": 1.38877534866333, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.875267744064331, + "num_tokens": 284466591.0, + "step": 7802 + }, + { + "epoch": 1.449025069637883, + "grad_norm": 1.5336084365844727, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8672676086425781, + "num_tokens": 284504369.0, + "step": 7803 + }, + { + "epoch": 1.4492107706592385, + "grad_norm": 1.7780780792236328, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8702907562255859, + "num_tokens": 284540193.0, + "step": 7804 + }, + { + "epoch": 1.4493964716805943, + "grad_norm": 1.4796253442764282, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8830845355987549, + "num_tokens": 284576253.0, + "step": 7805 + }, + { + "epoch": 1.4495821727019498, + "grad_norm": 1.6028037071228027, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8800508379936218, + "num_tokens": 284608818.0, + "step": 7806 + }, + { + "epoch": 1.4497678737233055, + "grad_norm": 1.6267890930175781, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8601938486099243, + "num_tokens": 284645034.0, + "step": 7807 + }, + { + "epoch": 1.449953574744661, + "grad_norm": 1.6030935049057007, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8799450397491455, + "num_tokens": 284682634.0, + "step": 7808 + }, + { + "epoch": 1.4501392757660168, + "grad_norm": 1.6474472284317017, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8814109563827515, + "num_tokens": 284717012.0, + "step": 7809 + }, + { + "epoch": 1.4503249767873723, + "grad_norm": 1.5647268295288086, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.887826681137085, + "num_tokens": 284750746.0, + "step": 7810 + }, + { + "epoch": 1.450510677808728, + "grad_norm": 1.4349093437194824, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8677176833152771, + "num_tokens": 284794558.0, + "step": 7811 + }, + { + "epoch": 1.4506963788300835, + "grad_norm": 1.4636086225509644, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8705899715423584, + "num_tokens": 284838861.0, + "step": 7812 + }, + { + "epoch": 1.4508820798514392, + "grad_norm": 1.5194777250289917, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8681197166442871, + "num_tokens": 284877659.0, + "step": 7813 + }, + { + "epoch": 1.4510677808727948, + "grad_norm": 1.6205065250396729, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8624106645584106, + "num_tokens": 284917300.0, + "step": 7814 + }, + { + "epoch": 1.4512534818941505, + "grad_norm": 1.5184255838394165, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.883722186088562, + "num_tokens": 284954598.0, + "step": 7815 + }, + { + "epoch": 1.451439182915506, + "grad_norm": 1.699042558670044, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8790859580039978, + "num_tokens": 284983825.0, + "step": 7816 + }, + { + "epoch": 1.4516248839368617, + "grad_norm": 1.5846025943756104, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8637077212333679, + "num_tokens": 285018642.0, + "step": 7817 + }, + { + "epoch": 1.4518105849582172, + "grad_norm": 1.5163207054138184, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8803625106811523, + "num_tokens": 285055845.0, + "step": 7818 + }, + { + "epoch": 1.4519962859795728, + "grad_norm": 1.4691141843795776, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8630948662757874, + "num_tokens": 285098101.0, + "step": 7819 + }, + { + "epoch": 1.4521819870009285, + "grad_norm": 1.5136561393737793, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8579871654510498, + "num_tokens": 285138141.0, + "step": 7820 + }, + { + "epoch": 1.4523676880222842, + "grad_norm": 1.5911288261413574, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8720999956130981, + "num_tokens": 285170939.0, + "step": 7821 + }, + { + "epoch": 1.4525533890436397, + "grad_norm": 1.632529616355896, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8683938384056091, + "num_tokens": 285205199.0, + "step": 7822 + }, + { + "epoch": 1.4527390900649952, + "grad_norm": 1.7226237058639526, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8612834811210632, + "num_tokens": 285238455.0, + "step": 7823 + }, + { + "epoch": 1.452924791086351, + "grad_norm": 1.459557056427002, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.876863956451416, + "num_tokens": 285277602.0, + "step": 7824 + }, + { + "epoch": 1.4531104921077067, + "grad_norm": 1.3726521730422974, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8867571949958801, + "num_tokens": 285320772.0, + "step": 7825 + }, + { + "epoch": 1.4532961931290622, + "grad_norm": 1.6028673648834229, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8712496757507324, + "num_tokens": 285354348.0, + "step": 7826 + }, + { + "epoch": 1.4534818941504177, + "grad_norm": 1.566833257675171, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8858871459960938, + "num_tokens": 285389388.0, + "step": 7827 + }, + { + "epoch": 1.4536675951717735, + "grad_norm": 1.6135629415512085, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8854663968086243, + "num_tokens": 285423662.0, + "step": 7828 + }, + { + "epoch": 1.4538532961931292, + "grad_norm": 1.4308950901031494, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8816611766815186, + "num_tokens": 285464300.0, + "step": 7829 + }, + { + "epoch": 1.4540389972144847, + "grad_norm": 1.5259172916412354, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8694581985473633, + "num_tokens": 285505925.0, + "step": 7830 + }, + { + "epoch": 1.4542246982358402, + "grad_norm": 1.3846005201339722, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.8930290341377258, + "num_tokens": 285543957.0, + "step": 7831 + }, + { + "epoch": 1.454410399257196, + "grad_norm": 1.8968117237091064, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8691537380218506, + "num_tokens": 285582051.0, + "step": 7832 + }, + { + "epoch": 1.4545961002785515, + "grad_norm": 1.4252978563308716, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8843709826469421, + "num_tokens": 285624651.0, + "step": 7833 + }, + { + "epoch": 1.4547818012999072, + "grad_norm": 1.510061264038086, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8605484962463379, + "num_tokens": 285667619.0, + "step": 7834 + }, + { + "epoch": 1.4549675023212627, + "grad_norm": 1.5044256448745728, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8758643865585327, + "num_tokens": 285704338.0, + "step": 7835 + }, + { + "epoch": 1.4551532033426184, + "grad_norm": 1.6858627796173096, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.878726065158844, + "num_tokens": 285737786.0, + "step": 7836 + }, + { + "epoch": 1.455338904363974, + "grad_norm": 1.497861623764038, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8848252296447754, + "num_tokens": 285774579.0, + "step": 7837 + }, + { + "epoch": 1.4555246053853297, + "grad_norm": 1.4664199352264404, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8870915770530701, + "num_tokens": 285811552.0, + "step": 7838 + }, + { + "epoch": 1.4557103064066852, + "grad_norm": 1.6112477779388428, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8637773394584656, + "num_tokens": 285849947.0, + "step": 7839 + }, + { + "epoch": 1.455896007428041, + "grad_norm": 1.4907218217849731, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8769638538360596, + "num_tokens": 285887480.0, + "step": 7840 + }, + { + "epoch": 1.4560817084493964, + "grad_norm": 1.6091816425323486, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8736528754234314, + "num_tokens": 285923532.0, + "step": 7841 + }, + { + "epoch": 1.4562674094707522, + "grad_norm": 1.44418466091156, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8816430568695068, + "num_tokens": 285963366.0, + "step": 7842 + }, + { + "epoch": 1.4564531104921077, + "grad_norm": 1.409084439277649, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8830621242523193, + "num_tokens": 286005539.0, + "step": 7843 + }, + { + "epoch": 1.4566388115134634, + "grad_norm": 1.6757550239562988, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8733583688735962, + "num_tokens": 286047851.0, + "step": 7844 + }, + { + "epoch": 1.456824512534819, + "grad_norm": 1.4921363592147827, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8809016942977905, + "num_tokens": 286086433.0, + "step": 7845 + }, + { + "epoch": 1.4570102135561744, + "grad_norm": 1.5432283878326416, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8836961984634399, + "num_tokens": 286124302.0, + "step": 7846 + }, + { + "epoch": 1.4571959145775302, + "grad_norm": 1.598204493522644, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8821226358413696, + "num_tokens": 286160811.0, + "step": 7847 + }, + { + "epoch": 1.457381615598886, + "grad_norm": 1.6601804494857788, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8669704794883728, + "num_tokens": 286196393.0, + "step": 7848 + }, + { + "epoch": 1.4575673166202414, + "grad_norm": 1.5410465002059937, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8770890235900879, + "num_tokens": 286230201.0, + "step": 7849 + }, + { + "epoch": 1.457753017641597, + "grad_norm": 1.4776256084442139, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8862484693527222, + "num_tokens": 286267362.0, + "step": 7850 + }, + { + "epoch": 1.4579387186629527, + "grad_norm": 1.4024667739868164, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8906549215316772, + "num_tokens": 286304850.0, + "step": 7851 + }, + { + "epoch": 1.4581244196843084, + "grad_norm": 1.6392760276794434, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8763343691825867, + "num_tokens": 286337640.0, + "step": 7852 + }, + { + "epoch": 1.458310120705664, + "grad_norm": 1.560500979423523, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8714091181755066, + "num_tokens": 286374568.0, + "step": 7853 + }, + { + "epoch": 1.4584958217270194, + "grad_norm": 1.5742286443710327, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8762781023979187, + "num_tokens": 286410530.0, + "step": 7854 + }, + { + "epoch": 1.4586815227483751, + "grad_norm": 1.4739220142364502, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8756842613220215, + "num_tokens": 286448251.0, + "step": 7855 + }, + { + "epoch": 1.4588672237697307, + "grad_norm": 1.4956499338150024, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8840050101280212, + "num_tokens": 286484394.0, + "step": 7856 + }, + { + "epoch": 1.4590529247910864, + "grad_norm": 1.5427321195602417, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8855206370353699, + "num_tokens": 286518357.0, + "step": 7857 + }, + { + "epoch": 1.459238625812442, + "grad_norm": 1.647546648979187, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8736376762390137, + "num_tokens": 286548713.0, + "step": 7858 + }, + { + "epoch": 1.4594243268337976, + "grad_norm": 1.6437220573425293, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8841018676757812, + "num_tokens": 286580529.0, + "step": 7859 + }, + { + "epoch": 1.4596100278551531, + "grad_norm": 1.5398107767105103, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8745694756507874, + "num_tokens": 286617800.0, + "step": 7860 + }, + { + "epoch": 1.4597957288765089, + "grad_norm": 1.5243507623672485, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8856232166290283, + "num_tokens": 286651297.0, + "step": 7861 + }, + { + "epoch": 1.4599814298978644, + "grad_norm": 1.5156474113464355, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8800786733627319, + "num_tokens": 286686441.0, + "step": 7862 + }, + { + "epoch": 1.4601671309192201, + "grad_norm": 1.5304192304611206, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8787838220596313, + "num_tokens": 286724777.0, + "step": 7863 + }, + { + "epoch": 1.4603528319405756, + "grad_norm": 1.4450242519378662, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8955197334289551, + "num_tokens": 286761965.0, + "step": 7864 + }, + { + "epoch": 1.4605385329619314, + "grad_norm": 1.7975437641143799, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8694043159484863, + "num_tokens": 286792058.0, + "step": 7865 + }, + { + "epoch": 1.4607242339832869, + "grad_norm": 1.4000133275985718, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8958661556243896, + "num_tokens": 286831608.0, + "step": 7866 + }, + { + "epoch": 1.4609099350046426, + "grad_norm": 1.570237159729004, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8700757622718811, + "num_tokens": 286868444.0, + "step": 7867 + }, + { + "epoch": 1.4610956360259981, + "grad_norm": 1.484086513519287, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8814930319786072, + "num_tokens": 286906441.0, + "step": 7868 + }, + { + "epoch": 1.4612813370473536, + "grad_norm": 1.5733895301818848, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8750301599502563, + "num_tokens": 286944940.0, + "step": 7869 + }, + { + "epoch": 1.4614670380687094, + "grad_norm": 1.563011646270752, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8865721225738525, + "num_tokens": 286979352.0, + "step": 7870 + }, + { + "epoch": 1.461652739090065, + "grad_norm": 1.5718653202056885, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8887990713119507, + "num_tokens": 287012515.0, + "step": 7871 + }, + { + "epoch": 1.4618384401114206, + "grad_norm": 1.4551868438720703, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8916780948638916, + "num_tokens": 287050447.0, + "step": 7872 + }, + { + "epoch": 1.4620241411327761, + "grad_norm": 1.646492838859558, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8874716758728027, + "num_tokens": 287081490.0, + "step": 7873 + }, + { + "epoch": 1.4622098421541319, + "grad_norm": 1.4965330362319946, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8797482252120972, + "num_tokens": 287120093.0, + "step": 7874 + }, + { + "epoch": 1.4623955431754876, + "grad_norm": 1.5945870876312256, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8815709948539734, + "num_tokens": 287153339.0, + "step": 7875 + }, + { + "epoch": 1.462581244196843, + "grad_norm": 1.5821155309677124, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8892174363136292, + "num_tokens": 287183484.0, + "step": 7876 + }, + { + "epoch": 1.4627669452181986, + "grad_norm": 1.5423884391784668, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8716686964035034, + "num_tokens": 287224630.0, + "step": 7877 + }, + { + "epoch": 1.4629526462395543, + "grad_norm": 1.4771193265914917, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8740438222885132, + "num_tokens": 287263094.0, + "step": 7878 + }, + { + "epoch": 1.4631383472609099, + "grad_norm": 1.4020243883132935, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8743302822113037, + "num_tokens": 287304383.0, + "step": 7879 + }, + { + "epoch": 1.4633240482822656, + "grad_norm": 1.6733441352844238, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8663175106048584, + "num_tokens": 287338440.0, + "step": 7880 + }, + { + "epoch": 1.463509749303621, + "grad_norm": 1.4760602712631226, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.890889048576355, + "num_tokens": 287375379.0, + "step": 7881 + }, + { + "epoch": 1.4636954503249768, + "grad_norm": 1.785649061203003, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8814133405685425, + "num_tokens": 287405714.0, + "step": 7882 + }, + { + "epoch": 1.4638811513463323, + "grad_norm": 1.4340152740478516, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8893024325370789, + "num_tokens": 287443523.0, + "step": 7883 + }, + { + "epoch": 1.464066852367688, + "grad_norm": 1.4985929727554321, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8784153461456299, + "num_tokens": 287481322.0, + "step": 7884 + }, + { + "epoch": 1.4642525533890436, + "grad_norm": 1.5978554487228394, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8767000436782837, + "num_tokens": 287516606.0, + "step": 7885 + }, + { + "epoch": 1.4644382544103993, + "grad_norm": 1.6483774185180664, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8822623491287231, + "num_tokens": 287547172.0, + "step": 7886 + }, + { + "epoch": 1.4646239554317548, + "grad_norm": 2.0075161457061768, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8843019008636475, + "num_tokens": 287582576.0, + "step": 7887 + }, + { + "epoch": 1.4648096564531106, + "grad_norm": 1.4911834001541138, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8828597068786621, + "num_tokens": 287622337.0, + "step": 7888 + }, + { + "epoch": 1.464995357474466, + "grad_norm": 1.5963470935821533, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8838145136833191, + "num_tokens": 287656223.0, + "step": 7889 + }, + { + "epoch": 1.4651810584958218, + "grad_norm": 1.4979318380355835, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8895865678787231, + "num_tokens": 287692797.0, + "step": 7890 + }, + { + "epoch": 1.4653667595171773, + "grad_norm": 1.505012035369873, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.878020703792572, + "num_tokens": 287726099.0, + "step": 7891 + }, + { + "epoch": 1.4655524605385328, + "grad_norm": 1.5327062606811523, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8879210352897644, + "num_tokens": 287759062.0, + "step": 7892 + }, + { + "epoch": 1.4657381615598886, + "grad_norm": 1.6048022508621216, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8900406360626221, + "num_tokens": 287792725.0, + "step": 7893 + }, + { + "epoch": 1.4659238625812443, + "grad_norm": 1.6770613193511963, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8977050185203552, + "num_tokens": 287821360.0, + "step": 7894 + }, + { + "epoch": 1.4661095636025998, + "grad_norm": 1.493623971939087, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8878998160362244, + "num_tokens": 287854737.0, + "step": 7895 + }, + { + "epoch": 1.4662952646239553, + "grad_norm": 1.6663700342178345, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8477837443351746, + "num_tokens": 287890810.0, + "step": 7896 + }, + { + "epoch": 1.466480965645311, + "grad_norm": 1.6592351198196411, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8809570074081421, + "num_tokens": 287924006.0, + "step": 7897 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.4658786058425903, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8697227239608765, + "num_tokens": 287964254.0, + "step": 7898 + }, + { + "epoch": 1.4668523676880223, + "grad_norm": 1.4796446561813354, + "learning_rate": 1e-06, + "loss": 0.2832, + "mean_token_accuracy": 0.8985731601715088, + "num_tokens": 287998517.0, + "step": 7899 + }, + { + "epoch": 1.4670380687093778, + "grad_norm": 1.49129319190979, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8838576078414917, + "num_tokens": 288037741.0, + "step": 7900 + }, + { + "epoch": 1.4672237697307335, + "grad_norm": 1.5299495458602905, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8850057125091553, + "num_tokens": 288071934.0, + "step": 7901 + }, + { + "epoch": 1.4674094707520893, + "grad_norm": 1.5455094575881958, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8713210225105286, + "num_tokens": 288106898.0, + "step": 7902 + }, + { + "epoch": 1.4675951717734448, + "grad_norm": 1.5084701776504517, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8751829266548157, + "num_tokens": 288144754.0, + "step": 7903 + }, + { + "epoch": 1.4677808727948003, + "grad_norm": 1.5546495914459229, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8802660703659058, + "num_tokens": 288182305.0, + "step": 7904 + }, + { + "epoch": 1.467966573816156, + "grad_norm": 1.624624252319336, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8555338382720947, + "num_tokens": 288222287.0, + "step": 7905 + }, + { + "epoch": 1.4681522748375115, + "grad_norm": 1.4730207920074463, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.9007076025009155, + "num_tokens": 288255429.0, + "step": 7906 + }, + { + "epoch": 1.4683379758588673, + "grad_norm": 1.8531663417816162, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.862421989440918, + "num_tokens": 288288410.0, + "step": 7907 + }, + { + "epoch": 1.4685236768802228, + "grad_norm": 1.6086764335632324, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8840894103050232, + "num_tokens": 288319102.0, + "step": 7908 + }, + { + "epoch": 1.4687093779015785, + "grad_norm": 1.4809720516204834, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8750305771827698, + "num_tokens": 288361975.0, + "step": 7909 + }, + { + "epoch": 1.468895078922934, + "grad_norm": 1.5705819129943848, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8790661692619324, + "num_tokens": 288399019.0, + "step": 7910 + }, + { + "epoch": 1.4690807799442898, + "grad_norm": 1.6102385520935059, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8854027390480042, + "num_tokens": 288428333.0, + "step": 7911 + }, + { + "epoch": 1.4692664809656453, + "grad_norm": 1.5982218980789185, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8721519708633423, + "num_tokens": 288461009.0, + "step": 7912 + }, + { + "epoch": 1.469452181987001, + "grad_norm": 1.6052725315093994, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8821844458580017, + "num_tokens": 288492079.0, + "step": 7913 + }, + { + "epoch": 1.4696378830083565, + "grad_norm": 1.4737927913665771, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8844165205955505, + "num_tokens": 288529623.0, + "step": 7914 + }, + { + "epoch": 1.469823584029712, + "grad_norm": 1.5731329917907715, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.86907958984375, + "num_tokens": 288568704.0, + "step": 7915 + }, + { + "epoch": 1.4700092850510678, + "grad_norm": 1.4893616437911987, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.886069655418396, + "num_tokens": 288610800.0, + "step": 7916 + }, + { + "epoch": 1.4701949860724235, + "grad_norm": 1.544819951057434, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8838512301445007, + "num_tokens": 288646190.0, + "step": 7917 + }, + { + "epoch": 1.470380687093779, + "grad_norm": 1.5792561769485474, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8743605017662048, + "num_tokens": 288679350.0, + "step": 7918 + }, + { + "epoch": 1.4705663881151345, + "grad_norm": 1.5559768676757812, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8895138502120972, + "num_tokens": 288712742.0, + "step": 7919 + }, + { + "epoch": 1.4707520891364902, + "grad_norm": 1.5389882326126099, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8766431212425232, + "num_tokens": 288750334.0, + "step": 7920 + }, + { + "epoch": 1.470937790157846, + "grad_norm": 1.4992012977600098, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.879874587059021, + "num_tokens": 288790655.0, + "step": 7921 + }, + { + "epoch": 1.4711234911792015, + "grad_norm": 1.4793143272399902, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.882160484790802, + "num_tokens": 288829749.0, + "step": 7922 + }, + { + "epoch": 1.471309192200557, + "grad_norm": 1.4115815162658691, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8748093843460083, + "num_tokens": 288872336.0, + "step": 7923 + }, + { + "epoch": 1.4714948932219127, + "grad_norm": 1.4817125797271729, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.877755880355835, + "num_tokens": 288911180.0, + "step": 7924 + }, + { + "epoch": 1.4716805942432685, + "grad_norm": 1.4531188011169434, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8859618902206421, + "num_tokens": 288953137.0, + "step": 7925 + }, + { + "epoch": 1.471866295264624, + "grad_norm": 1.620977759361267, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8861669301986694, + "num_tokens": 288986649.0, + "step": 7926 + }, + { + "epoch": 1.4720519962859795, + "grad_norm": 1.4657177925109863, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8835086226463318, + "num_tokens": 289027108.0, + "step": 7927 + }, + { + "epoch": 1.4722376973073352, + "grad_norm": 1.458344578742981, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8852719664573669, + "num_tokens": 289064046.0, + "step": 7928 + }, + { + "epoch": 1.4724233983286907, + "grad_norm": 1.8427870273590088, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.863882303237915, + "num_tokens": 289093261.0, + "step": 7929 + }, + { + "epoch": 1.4726090993500465, + "grad_norm": 1.5052162408828735, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8767019510269165, + "num_tokens": 289128943.0, + "step": 7930 + }, + { + "epoch": 1.472794800371402, + "grad_norm": 1.5286248922348022, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8775674104690552, + "num_tokens": 289170444.0, + "step": 7931 + }, + { + "epoch": 1.4729805013927577, + "grad_norm": 1.4449678659439087, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8887203335762024, + "num_tokens": 289212289.0, + "step": 7932 + }, + { + "epoch": 1.4731662024141132, + "grad_norm": 1.649444580078125, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8833451271057129, + "num_tokens": 289242215.0, + "step": 7933 + }, + { + "epoch": 1.473351903435469, + "grad_norm": 1.6067748069763184, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8672125339508057, + "num_tokens": 289280135.0, + "step": 7934 + }, + { + "epoch": 1.4735376044568245, + "grad_norm": 1.465285062789917, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8776426315307617, + "num_tokens": 289320635.0, + "step": 7935 + }, + { + "epoch": 1.4737233054781802, + "grad_norm": 1.5258128643035889, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8713592290878296, + "num_tokens": 289357930.0, + "step": 7936 + }, + { + "epoch": 1.4739090064995357, + "grad_norm": 1.4694700241088867, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8800088763237, + "num_tokens": 289397990.0, + "step": 7937 + }, + { + "epoch": 1.4740947075208914, + "grad_norm": 1.7958463430404663, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8745532631874084, + "num_tokens": 289428949.0, + "step": 7938 + }, + { + "epoch": 1.474280408542247, + "grad_norm": 1.5112205743789673, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8707755208015442, + "num_tokens": 289465055.0, + "step": 7939 + }, + { + "epoch": 1.4744661095636027, + "grad_norm": 1.697326898574829, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8689431548118591, + "num_tokens": 289494633.0, + "step": 7940 + }, + { + "epoch": 1.4746518105849582, + "grad_norm": 1.691931962966919, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8764820098876953, + "num_tokens": 289524362.0, + "step": 7941 + }, + { + "epoch": 1.4748375116063137, + "grad_norm": 1.6624962091445923, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.855471134185791, + "num_tokens": 289558684.0, + "step": 7942 + }, + { + "epoch": 1.4750232126276694, + "grad_norm": 1.5568424463272095, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8913652300834656, + "num_tokens": 289591346.0, + "step": 7943 + }, + { + "epoch": 1.4752089136490252, + "grad_norm": 1.5336503982543945, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8671004772186279, + "num_tokens": 289630793.0, + "step": 7944 + }, + { + "epoch": 1.4753946146703807, + "grad_norm": 1.4156254529953003, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8938015699386597, + "num_tokens": 289667534.0, + "step": 7945 + }, + { + "epoch": 1.4755803156917362, + "grad_norm": 1.673484444618225, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8744769096374512, + "num_tokens": 289698697.0, + "step": 7946 + }, + { + "epoch": 1.475766016713092, + "grad_norm": 1.4818596839904785, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8780955672264099, + "num_tokens": 289738451.0, + "step": 7947 + }, + { + "epoch": 1.4759517177344477, + "grad_norm": 1.570249319076538, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8767575025558472, + "num_tokens": 289776863.0, + "step": 7948 + }, + { + "epoch": 1.4761374187558032, + "grad_norm": 1.5378915071487427, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8681782484054565, + "num_tokens": 289814346.0, + "step": 7949 + }, + { + "epoch": 1.4763231197771587, + "grad_norm": 1.6568686962127686, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8757018446922302, + "num_tokens": 289848640.0, + "step": 7950 + }, + { + "epoch": 1.4765088207985144, + "grad_norm": 1.4736990928649902, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8870750665664673, + "num_tokens": 289889165.0, + "step": 7951 + }, + { + "epoch": 1.47669452181987, + "grad_norm": 1.555777907371521, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8767342567443848, + "num_tokens": 289926479.0, + "step": 7952 + }, + { + "epoch": 1.4768802228412257, + "grad_norm": 1.8002443313598633, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8673759698867798, + "num_tokens": 289957539.0, + "step": 7953 + }, + { + "epoch": 1.4770659238625812, + "grad_norm": 1.6100633144378662, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.870031476020813, + "num_tokens": 289995859.0, + "step": 7954 + }, + { + "epoch": 1.477251624883937, + "grad_norm": 1.584733009338379, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8834681510925293, + "num_tokens": 290032546.0, + "step": 7955 + }, + { + "epoch": 1.4774373259052924, + "grad_norm": 1.600935935974121, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8799927830696106, + "num_tokens": 290068731.0, + "step": 7956 + }, + { + "epoch": 1.4776230269266482, + "grad_norm": 1.3939104080200195, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8799633979797363, + "num_tokens": 290110943.0, + "step": 7957 + }, + { + "epoch": 1.4778087279480037, + "grad_norm": 1.5527912378311157, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8770849704742432, + "num_tokens": 290149299.0, + "step": 7958 + }, + { + "epoch": 1.4779944289693594, + "grad_norm": 1.477258563041687, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8779531717300415, + "num_tokens": 290188104.0, + "step": 7959 + }, + { + "epoch": 1.478180129990715, + "grad_norm": 1.6369258165359497, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8563926219940186, + "num_tokens": 290225842.0, + "step": 7960 + }, + { + "epoch": 1.4783658310120706, + "grad_norm": 1.5915549993515015, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8745934963226318, + "num_tokens": 290260108.0, + "step": 7961 + }, + { + "epoch": 1.4785515320334262, + "grad_norm": 1.428665041923523, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8683515787124634, + "num_tokens": 290303875.0, + "step": 7962 + }, + { + "epoch": 1.4787372330547819, + "grad_norm": 1.4671847820281982, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8629308342933655, + "num_tokens": 290349176.0, + "step": 7963 + }, + { + "epoch": 1.4789229340761374, + "grad_norm": 1.5062432289123535, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8612138628959656, + "num_tokens": 290388723.0, + "step": 7964 + }, + { + "epoch": 1.479108635097493, + "grad_norm": 1.5820038318634033, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8883469104766846, + "num_tokens": 290421260.0, + "step": 7965 + }, + { + "epoch": 1.4792943361188486, + "grad_norm": 1.5928592681884766, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8767555952072144, + "num_tokens": 290453943.0, + "step": 7966 + }, + { + "epoch": 1.4794800371402044, + "grad_norm": 1.4321184158325195, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.882883608341217, + "num_tokens": 290495238.0, + "step": 7967 + }, + { + "epoch": 1.4796657381615599, + "grad_norm": 1.6396269798278809, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8693668246269226, + "num_tokens": 290527528.0, + "step": 7968 + }, + { + "epoch": 1.4798514391829154, + "grad_norm": 1.4880114793777466, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8918502926826477, + "num_tokens": 290564077.0, + "step": 7969 + }, + { + "epoch": 1.4800371402042711, + "grad_norm": 1.5215861797332764, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8780772089958191, + "num_tokens": 290603678.0, + "step": 7970 + }, + { + "epoch": 1.4802228412256269, + "grad_norm": 1.5528411865234375, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8732292056083679, + "num_tokens": 290640641.0, + "step": 7971 + }, + { + "epoch": 1.4804085422469824, + "grad_norm": 1.4187146425247192, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8802849650382996, + "num_tokens": 290680935.0, + "step": 7972 + }, + { + "epoch": 1.4805942432683379, + "grad_norm": 1.5590678453445435, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.863793671131134, + "num_tokens": 290718444.0, + "step": 7973 + }, + { + "epoch": 1.4807799442896936, + "grad_norm": 1.5757050514221191, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8685088157653809, + "num_tokens": 290755422.0, + "step": 7974 + }, + { + "epoch": 1.4809656453110491, + "grad_norm": 1.5155950784683228, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8918563723564148, + "num_tokens": 290791537.0, + "step": 7975 + }, + { + "epoch": 1.4811513463324049, + "grad_norm": 1.665191888809204, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8655720949172974, + "num_tokens": 290826334.0, + "step": 7976 + }, + { + "epoch": 1.4813370473537604, + "grad_norm": 1.4840415716171265, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8792198300361633, + "num_tokens": 290867360.0, + "step": 7977 + }, + { + "epoch": 1.481522748375116, + "grad_norm": 1.4627796411514282, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8880057334899902, + "num_tokens": 290905382.0, + "step": 7978 + }, + { + "epoch": 1.4817084493964716, + "grad_norm": 1.6680148839950562, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8861280679702759, + "num_tokens": 290935697.0, + "step": 7979 + }, + { + "epoch": 1.4818941504178273, + "grad_norm": 1.7491635084152222, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8519728779792786, + "num_tokens": 290969967.0, + "step": 7980 + }, + { + "epoch": 1.4820798514391829, + "grad_norm": 1.4314182996749878, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8765393495559692, + "num_tokens": 291012724.0, + "step": 7981 + }, + { + "epoch": 1.4822655524605386, + "grad_norm": 1.472428798675537, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.88353031873703, + "num_tokens": 291049530.0, + "step": 7982 + }, + { + "epoch": 1.482451253481894, + "grad_norm": 1.5100278854370117, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8845600485801697, + "num_tokens": 291086178.0, + "step": 7983 + }, + { + "epoch": 1.4826369545032498, + "grad_norm": 1.6166481971740723, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8793273568153381, + "num_tokens": 291121068.0, + "step": 7984 + }, + { + "epoch": 1.4828226555246053, + "grad_norm": 1.598414421081543, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8698131442070007, + "num_tokens": 291156767.0, + "step": 7985 + }, + { + "epoch": 1.483008356545961, + "grad_norm": 1.4915666580200195, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8876115679740906, + "num_tokens": 291191798.0, + "step": 7986 + }, + { + "epoch": 1.4831940575673166, + "grad_norm": 1.479887843132019, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8745735883712769, + "num_tokens": 291233510.0, + "step": 7987 + }, + { + "epoch": 1.483379758588672, + "grad_norm": 1.5239206552505493, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8801003098487854, + "num_tokens": 291269524.0, + "step": 7988 + }, + { + "epoch": 1.4835654596100278, + "grad_norm": 1.620110034942627, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8836091756820679, + "num_tokens": 291300811.0, + "step": 7989 + }, + { + "epoch": 1.4837511606313836, + "grad_norm": 1.3886457681655884, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8952429294586182, + "num_tokens": 291340230.0, + "step": 7990 + }, + { + "epoch": 1.483936861652739, + "grad_norm": 1.6258548498153687, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8852066993713379, + "num_tokens": 291372624.0, + "step": 7991 + }, + { + "epoch": 1.4841225626740946, + "grad_norm": 1.5302988290786743, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8838528394699097, + "num_tokens": 291408155.0, + "step": 7992 + }, + { + "epoch": 1.4843082636954503, + "grad_norm": 1.70443594455719, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8801578283309937, + "num_tokens": 291438965.0, + "step": 7993 + }, + { + "epoch": 1.484493964716806, + "grad_norm": 1.5321449041366577, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8751373291015625, + "num_tokens": 291477975.0, + "step": 7994 + }, + { + "epoch": 1.4846796657381616, + "grad_norm": 1.6691622734069824, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8823184370994568, + "num_tokens": 291509257.0, + "step": 7995 + }, + { + "epoch": 1.484865366759517, + "grad_norm": 1.6142088174819946, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8817442655563354, + "num_tokens": 291542703.0, + "step": 7996 + }, + { + "epoch": 1.4850510677808728, + "grad_norm": 1.8602147102355957, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8697025775909424, + "num_tokens": 291573177.0, + "step": 7997 + }, + { + "epoch": 1.4852367688022285, + "grad_norm": 1.4220248460769653, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8596410751342773, + "num_tokens": 291618097.0, + "step": 7998 + }, + { + "epoch": 1.485422469823584, + "grad_norm": 1.499617338180542, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.882394552230835, + "num_tokens": 291653725.0, + "step": 7999 + }, + { + "epoch": 1.4856081708449396, + "grad_norm": 1.5662609338760376, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8744519352912903, + "num_tokens": 291692480.0, + "step": 8000 + }, + { + "epoch": 1.4857938718662953, + "grad_norm": 1.5110740661621094, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8825845122337341, + "num_tokens": 291727702.0, + "step": 8001 + }, + { + "epoch": 1.4859795728876508, + "grad_norm": 1.6395847797393799, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8699072599411011, + "num_tokens": 291763224.0, + "step": 8002 + }, + { + "epoch": 1.4861652739090065, + "grad_norm": 1.4430499076843262, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8733345866203308, + "num_tokens": 291804359.0, + "step": 8003 + }, + { + "epoch": 1.486350974930362, + "grad_norm": 1.5090110301971436, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.885647714138031, + "num_tokens": 291839613.0, + "step": 8004 + }, + { + "epoch": 1.4865366759517178, + "grad_norm": 1.4607089757919312, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8922525644302368, + "num_tokens": 291873967.0, + "step": 8005 + }, + { + "epoch": 1.4867223769730733, + "grad_norm": 1.5081814527511597, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.886684775352478, + "num_tokens": 291912632.0, + "step": 8006 + }, + { + "epoch": 1.486908077994429, + "grad_norm": 1.5743727684020996, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8781094551086426, + "num_tokens": 291948220.0, + "step": 8007 + }, + { + "epoch": 1.4870937790157845, + "grad_norm": 1.4558262825012207, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8826045989990234, + "num_tokens": 291988284.0, + "step": 8008 + }, + { + "epoch": 1.4872794800371403, + "grad_norm": 1.6683173179626465, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8742263317108154, + "num_tokens": 292023175.0, + "step": 8009 + }, + { + "epoch": 1.4874651810584958, + "grad_norm": 1.5733610391616821, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8768229484558105, + "num_tokens": 292060007.0, + "step": 8010 + }, + { + "epoch": 1.4876508820798515, + "grad_norm": 1.6940633058547974, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8518346548080444, + "num_tokens": 292094882.0, + "step": 8011 + }, + { + "epoch": 1.487836583101207, + "grad_norm": 1.7378616333007812, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8737455606460571, + "num_tokens": 292126399.0, + "step": 8012 + }, + { + "epoch": 1.4880222841225628, + "grad_norm": 1.476181983947754, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8800184726715088, + "num_tokens": 292167674.0, + "step": 8013 + }, + { + "epoch": 1.4882079851439183, + "grad_norm": 1.4905650615692139, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8821778297424316, + "num_tokens": 292203993.0, + "step": 8014 + }, + { + "epoch": 1.4883936861652738, + "grad_norm": 1.7341010570526123, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8656626343727112, + "num_tokens": 292238064.0, + "step": 8015 + }, + { + "epoch": 1.4885793871866295, + "grad_norm": 1.5931648015975952, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8775539398193359, + "num_tokens": 292272816.0, + "step": 8016 + }, + { + "epoch": 1.4887650882079853, + "grad_norm": 1.5560795068740845, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8628135323524475, + "num_tokens": 292312424.0, + "step": 8017 + }, + { + "epoch": 1.4889507892293408, + "grad_norm": 1.6031259298324585, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8802522420883179, + "num_tokens": 292347927.0, + "step": 8018 + }, + { + "epoch": 1.4891364902506963, + "grad_norm": 1.750789999961853, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8504438400268555, + "num_tokens": 292385638.0, + "step": 8019 + }, + { + "epoch": 1.489322191272052, + "grad_norm": 1.5720789432525635, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8891253471374512, + "num_tokens": 292417820.0, + "step": 8020 + }, + { + "epoch": 1.4895078922934077, + "grad_norm": 1.5134687423706055, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8884283900260925, + "num_tokens": 292454440.0, + "step": 8021 + }, + { + "epoch": 1.4896935933147633, + "grad_norm": 1.6855342388153076, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8784812688827515, + "num_tokens": 292484222.0, + "step": 8022 + }, + { + "epoch": 1.4898792943361188, + "grad_norm": 1.706849217414856, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8800843954086304, + "num_tokens": 292517946.0, + "step": 8023 + }, + { + "epoch": 1.4900649953574745, + "grad_norm": 1.4897758960723877, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8721926212310791, + "num_tokens": 292557773.0, + "step": 8024 + }, + { + "epoch": 1.49025069637883, + "grad_norm": 1.6042407751083374, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8813925981521606, + "num_tokens": 292590529.0, + "step": 8025 + }, + { + "epoch": 1.4904363974001857, + "grad_norm": 1.613667607307434, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8688300848007202, + "num_tokens": 292623003.0, + "step": 8026 + }, + { + "epoch": 1.4906220984215413, + "grad_norm": 1.5508430004119873, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.884605884552002, + "num_tokens": 292660028.0, + "step": 8027 + }, + { + "epoch": 1.490807799442897, + "grad_norm": 1.5708531141281128, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.884729266166687, + "num_tokens": 292694423.0, + "step": 8028 + }, + { + "epoch": 1.4909935004642525, + "grad_norm": 1.5426939725875854, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8846030831336975, + "num_tokens": 292730501.0, + "step": 8029 + }, + { + "epoch": 1.4911792014856082, + "grad_norm": 1.5668340921401978, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8737863302230835, + "num_tokens": 292767813.0, + "step": 8030 + }, + { + "epoch": 1.4913649025069637, + "grad_norm": 1.4392492771148682, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8749823570251465, + "num_tokens": 292805843.0, + "step": 8031 + }, + { + "epoch": 1.4915506035283195, + "grad_norm": 1.5403554439544678, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8819670677185059, + "num_tokens": 292840898.0, + "step": 8032 + }, + { + "epoch": 1.491736304549675, + "grad_norm": 1.467153549194336, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8815317153930664, + "num_tokens": 292880167.0, + "step": 8033 + }, + { + "epoch": 1.4919220055710307, + "grad_norm": 1.4973957538604736, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8734172582626343, + "num_tokens": 292919023.0, + "step": 8034 + }, + { + "epoch": 1.4921077065923862, + "grad_norm": 1.4255598783493042, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8844437599182129, + "num_tokens": 292959277.0, + "step": 8035 + }, + { + "epoch": 1.492293407613742, + "grad_norm": 1.516638159751892, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.889676034450531, + "num_tokens": 292993223.0, + "step": 8036 + }, + { + "epoch": 1.4924791086350975, + "grad_norm": 1.519647479057312, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8796835541725159, + "num_tokens": 293028195.0, + "step": 8037 + }, + { + "epoch": 1.492664809656453, + "grad_norm": 1.6501612663269043, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8799476623535156, + "num_tokens": 293059732.0, + "step": 8038 + }, + { + "epoch": 1.4928505106778087, + "grad_norm": 1.606828212738037, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.860971212387085, + "num_tokens": 293100046.0, + "step": 8039 + }, + { + "epoch": 1.4930362116991645, + "grad_norm": 1.7076141834259033, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.872649073600769, + "num_tokens": 293132480.0, + "step": 8040 + }, + { + "epoch": 1.49322191272052, + "grad_norm": 1.6179633140563965, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8744508028030396, + "num_tokens": 293168709.0, + "step": 8041 + }, + { + "epoch": 1.4934076137418755, + "grad_norm": 1.55562424659729, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.882013201713562, + "num_tokens": 293204533.0, + "step": 8042 + }, + { + "epoch": 1.4935933147632312, + "grad_norm": 1.4901496171951294, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.871444821357727, + "num_tokens": 293243578.0, + "step": 8043 + }, + { + "epoch": 1.493779015784587, + "grad_norm": 1.60659658908844, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8659536838531494, + "num_tokens": 293279034.0, + "step": 8044 + }, + { + "epoch": 1.4939647168059424, + "grad_norm": 1.6368335485458374, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8648171424865723, + "num_tokens": 293315758.0, + "step": 8045 + }, + { + "epoch": 1.494150417827298, + "grad_norm": 1.5865980386734009, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8886269927024841, + "num_tokens": 293349499.0, + "step": 8046 + }, + { + "epoch": 1.4943361188486537, + "grad_norm": 1.6079071760177612, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8864039182662964, + "num_tokens": 293385905.0, + "step": 8047 + }, + { + "epoch": 1.4945218198700092, + "grad_norm": 1.726814866065979, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.888146698474884, + "num_tokens": 293416447.0, + "step": 8048 + }, + { + "epoch": 1.494707520891365, + "grad_norm": 1.575149655342102, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.869867742061615, + "num_tokens": 293452279.0, + "step": 8049 + }, + { + "epoch": 1.4948932219127204, + "grad_norm": 1.6068710088729858, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8661717176437378, + "num_tokens": 293486304.0, + "step": 8050 + }, + { + "epoch": 1.4950789229340762, + "grad_norm": 1.5067657232284546, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8832595944404602, + "num_tokens": 293524111.0, + "step": 8051 + }, + { + "epoch": 1.4952646239554317, + "grad_norm": 1.4610167741775513, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8746519088745117, + "num_tokens": 293564477.0, + "step": 8052 + }, + { + "epoch": 1.4954503249767874, + "grad_norm": 1.628288745880127, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8610822558403015, + "num_tokens": 293602986.0, + "step": 8053 + }, + { + "epoch": 1.495636025998143, + "grad_norm": 1.550260305404663, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8650103807449341, + "num_tokens": 293644183.0, + "step": 8054 + }, + { + "epoch": 1.4958217270194987, + "grad_norm": 1.5630775690078735, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8754714131355286, + "num_tokens": 293681847.0, + "step": 8055 + }, + { + "epoch": 1.4960074280408542, + "grad_norm": 1.547453761100769, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8792089819908142, + "num_tokens": 293717139.0, + "step": 8056 + }, + { + "epoch": 1.49619312906221, + "grad_norm": 1.5695738792419434, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8610638380050659, + "num_tokens": 293757916.0, + "step": 8057 + }, + { + "epoch": 1.4963788300835654, + "grad_norm": 1.4853966236114502, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8787308931350708, + "num_tokens": 293794485.0, + "step": 8058 + }, + { + "epoch": 1.4965645311049212, + "grad_norm": 1.497467279434204, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8829373717308044, + "num_tokens": 293833380.0, + "step": 8059 + }, + { + "epoch": 1.4967502321262767, + "grad_norm": 1.6666520833969116, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8810328841209412, + "num_tokens": 293867775.0, + "step": 8060 + }, + { + "epoch": 1.4969359331476322, + "grad_norm": 1.3813503980636597, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8640145063400269, + "num_tokens": 293916487.0, + "step": 8061 + }, + { + "epoch": 1.497121634168988, + "grad_norm": 1.51124906539917, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8850618600845337, + "num_tokens": 293957501.0, + "step": 8062 + }, + { + "epoch": 1.4973073351903436, + "grad_norm": 1.546934962272644, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8844448924064636, + "num_tokens": 293996641.0, + "step": 8063 + }, + { + "epoch": 1.4974930362116992, + "grad_norm": 1.4806352853775024, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8938379287719727, + "num_tokens": 294036131.0, + "step": 8064 + }, + { + "epoch": 1.4976787372330547, + "grad_norm": 1.6225554943084717, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8644082546234131, + "num_tokens": 294072236.0, + "step": 8065 + }, + { + "epoch": 1.4978644382544104, + "grad_norm": 1.593255877494812, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8727489709854126, + "num_tokens": 294108051.0, + "step": 8066 + }, + { + "epoch": 1.4980501392757661, + "grad_norm": 1.7416305541992188, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8856515884399414, + "num_tokens": 294138737.0, + "step": 8067 + }, + { + "epoch": 1.4982358402971216, + "grad_norm": 1.6145191192626953, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8794949054718018, + "num_tokens": 294175572.0, + "step": 8068 + }, + { + "epoch": 1.4984215413184772, + "grad_norm": 1.6480482816696167, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.869275689125061, + "num_tokens": 294213762.0, + "step": 8069 + }, + { + "epoch": 1.498607242339833, + "grad_norm": 1.6295667886734009, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8814549446105957, + "num_tokens": 294248809.0, + "step": 8070 + }, + { + "epoch": 1.4987929433611886, + "grad_norm": 1.4589744806289673, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8792974948883057, + "num_tokens": 294292040.0, + "step": 8071 + }, + { + "epoch": 1.4989786443825441, + "grad_norm": 1.5583233833312988, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8671180605888367, + "num_tokens": 294329284.0, + "step": 8072 + }, + { + "epoch": 1.4991643454038996, + "grad_norm": 1.5727944374084473, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8860200643539429, + "num_tokens": 294363901.0, + "step": 8073 + }, + { + "epoch": 1.4993500464252554, + "grad_norm": 1.69015371799469, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8939327001571655, + "num_tokens": 294395205.0, + "step": 8074 + }, + { + "epoch": 1.499535747446611, + "grad_norm": 1.6061151027679443, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8588268756866455, + "num_tokens": 294434104.0, + "step": 8075 + }, + { + "epoch": 1.4997214484679666, + "grad_norm": 1.4901992082595825, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8885160088539124, + "num_tokens": 294469266.0, + "step": 8076 + }, + { + "epoch": 1.4999071494893221, + "grad_norm": 1.6592791080474854, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8822019696235657, + "num_tokens": 294500968.0, + "step": 8077 + }, + { + "epoch": 1.5000928505106779, + "grad_norm": 1.5587272644042969, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8821690678596497, + "num_tokens": 294532687.0, + "step": 8078 + }, + { + "epoch": 1.5002785515320334, + "grad_norm": 1.4958916902542114, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8751696348190308, + "num_tokens": 294572440.0, + "step": 8079 + }, + { + "epoch": 1.5004642525533889, + "grad_norm": 1.477164387702942, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8772637248039246, + "num_tokens": 294610549.0, + "step": 8080 + }, + { + "epoch": 1.5006499535747446, + "grad_norm": 1.4649062156677246, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8785786628723145, + "num_tokens": 294649386.0, + "step": 8081 + }, + { + "epoch": 1.5008356545961004, + "grad_norm": 1.711279273033142, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.877734899520874, + "num_tokens": 294678574.0, + "step": 8082 + }, + { + "epoch": 1.5010213556174559, + "grad_norm": 1.4793919324874878, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8859491944313049, + "num_tokens": 294716531.0, + "step": 8083 + }, + { + "epoch": 1.5012070566388114, + "grad_norm": 1.5209671258926392, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8719521760940552, + "num_tokens": 294752470.0, + "step": 8084 + }, + { + "epoch": 1.501392757660167, + "grad_norm": 1.5169744491577148, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8816956281661987, + "num_tokens": 294792010.0, + "step": 8085 + }, + { + "epoch": 1.5015784586815228, + "grad_norm": 1.6068358421325684, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8589789867401123, + "num_tokens": 294829397.0, + "step": 8086 + }, + { + "epoch": 1.5017641597028784, + "grad_norm": 1.6516996622085571, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8818752765655518, + "num_tokens": 294865701.0, + "step": 8087 + }, + { + "epoch": 1.5019498607242339, + "grad_norm": 1.628938913345337, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8706144094467163, + "num_tokens": 294904671.0, + "step": 8088 + }, + { + "epoch": 1.5021355617455896, + "grad_norm": 1.6024291515350342, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8742161989212036, + "num_tokens": 294939420.0, + "step": 8089 + }, + { + "epoch": 1.5023212627669453, + "grad_norm": 1.7700809240341187, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.868955135345459, + "num_tokens": 294970468.0, + "step": 8090 + }, + { + "epoch": 1.5025069637883008, + "grad_norm": 1.5494662523269653, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8777569532394409, + "num_tokens": 295005688.0, + "step": 8091 + }, + { + "epoch": 1.5026926648096564, + "grad_norm": 1.6466343402862549, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8803680539131165, + "num_tokens": 295037514.0, + "step": 8092 + }, + { + "epoch": 1.502878365831012, + "grad_norm": 1.6443088054656982, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.86387699842453, + "num_tokens": 295075234.0, + "step": 8093 + }, + { + "epoch": 1.5030640668523678, + "grad_norm": 1.4334018230438232, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8915137052536011, + "num_tokens": 295114051.0, + "step": 8094 + }, + { + "epoch": 1.5032497678737233, + "grad_norm": 1.5832812786102295, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8878946304321289, + "num_tokens": 295147548.0, + "step": 8095 + }, + { + "epoch": 1.5034354688950788, + "grad_norm": 1.4793493747711182, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.888157069683075, + "num_tokens": 295183499.0, + "step": 8096 + }, + { + "epoch": 1.5036211699164346, + "grad_norm": 1.5848413705825806, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8610247373580933, + "num_tokens": 295221838.0, + "step": 8097 + }, + { + "epoch": 1.5038068709377903, + "grad_norm": 1.5515522956848145, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8728165030479431, + "num_tokens": 295258786.0, + "step": 8098 + }, + { + "epoch": 1.5039925719591458, + "grad_norm": 1.4364274740219116, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8874583840370178, + "num_tokens": 295297832.0, + "step": 8099 + }, + { + "epoch": 1.5041782729805013, + "grad_norm": 1.477702021598816, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8798900842666626, + "num_tokens": 295337330.0, + "step": 8100 + }, + { + "epoch": 1.504363974001857, + "grad_norm": 1.5922706127166748, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.876644492149353, + "num_tokens": 295373260.0, + "step": 8101 + }, + { + "epoch": 1.5045496750232126, + "grad_norm": 1.5758732557296753, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.871536374092102, + "num_tokens": 295408492.0, + "step": 8102 + }, + { + "epoch": 1.504735376044568, + "grad_norm": 1.4729889631271362, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8827462792396545, + "num_tokens": 295442253.0, + "step": 8103 + }, + { + "epoch": 1.5049210770659238, + "grad_norm": 1.5057824850082397, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8802844285964966, + "num_tokens": 295479325.0, + "step": 8104 + }, + { + "epoch": 1.5051067780872796, + "grad_norm": 1.4608668088912964, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8886610269546509, + "num_tokens": 295513724.0, + "step": 8105 + }, + { + "epoch": 1.505292479108635, + "grad_norm": 1.574204921722412, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8831623792648315, + "num_tokens": 295548377.0, + "step": 8106 + }, + { + "epoch": 1.5054781801299906, + "grad_norm": 1.6345034837722778, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.876990556716919, + "num_tokens": 295580898.0, + "step": 8107 + }, + { + "epoch": 1.5056638811513463, + "grad_norm": 1.8416696786880493, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.873586893081665, + "num_tokens": 295611825.0, + "step": 8108 + }, + { + "epoch": 1.505849582172702, + "grad_norm": 1.6676545143127441, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8714739680290222, + "num_tokens": 295649374.0, + "step": 8109 + }, + { + "epoch": 1.5060352831940576, + "grad_norm": 1.5158500671386719, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8815360069274902, + "num_tokens": 295685586.0, + "step": 8110 + }, + { + "epoch": 1.506220984215413, + "grad_norm": 1.6923160552978516, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8646080493927002, + "num_tokens": 295719799.0, + "step": 8111 + }, + { + "epoch": 1.5064066852367688, + "grad_norm": 1.735020637512207, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8800086975097656, + "num_tokens": 295750223.0, + "step": 8112 + }, + { + "epoch": 1.5065923862581245, + "grad_norm": 1.6088556051254272, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8800140023231506, + "num_tokens": 295787200.0, + "step": 8113 + }, + { + "epoch": 1.50677808727948, + "grad_norm": 1.6685668230056763, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8575575351715088, + "num_tokens": 295823795.0, + "step": 8114 + }, + { + "epoch": 1.5069637883008355, + "grad_norm": 1.559694528579712, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8825972676277161, + "num_tokens": 295856217.0, + "step": 8115 + }, + { + "epoch": 1.5071494893221913, + "grad_norm": 1.5994833707809448, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8848440051078796, + "num_tokens": 295886942.0, + "step": 8116 + }, + { + "epoch": 1.507335190343547, + "grad_norm": 1.7024869918823242, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8848742842674255, + "num_tokens": 295921365.0, + "step": 8117 + }, + { + "epoch": 1.5075208913649025, + "grad_norm": 1.5988826751708984, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8693689107894897, + "num_tokens": 295959211.0, + "step": 8118 + }, + { + "epoch": 1.507706592386258, + "grad_norm": 1.5722060203552246, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8668438196182251, + "num_tokens": 295996253.0, + "step": 8119 + }, + { + "epoch": 1.5078922934076138, + "grad_norm": 1.6418514251708984, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8810538053512573, + "num_tokens": 296029661.0, + "step": 8120 + }, + { + "epoch": 1.5080779944289695, + "grad_norm": 1.5657352209091187, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8769547939300537, + "num_tokens": 296063239.0, + "step": 8121 + }, + { + "epoch": 1.508263695450325, + "grad_norm": 1.5320687294006348, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8728224039077759, + "num_tokens": 296102836.0, + "step": 8122 + }, + { + "epoch": 1.5084493964716805, + "grad_norm": 1.5714130401611328, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8771483898162842, + "num_tokens": 296138760.0, + "step": 8123 + }, + { + "epoch": 1.5086350974930363, + "grad_norm": 1.8635348081588745, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8839091062545776, + "num_tokens": 296176404.0, + "step": 8124 + }, + { + "epoch": 1.508820798514392, + "grad_norm": 1.5101094245910645, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8744595050811768, + "num_tokens": 296212101.0, + "step": 8125 + }, + { + "epoch": 1.5090064995357473, + "grad_norm": 1.5148985385894775, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.882604718208313, + "num_tokens": 296250518.0, + "step": 8126 + }, + { + "epoch": 1.509192200557103, + "grad_norm": 1.501420259475708, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8830885291099548, + "num_tokens": 296287630.0, + "step": 8127 + }, + { + "epoch": 1.5093779015784587, + "grad_norm": 1.563256025314331, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8771345615386963, + "num_tokens": 296323465.0, + "step": 8128 + }, + { + "epoch": 1.5095636025998143, + "grad_norm": 1.6497166156768799, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8770542144775391, + "num_tokens": 296360258.0, + "step": 8129 + }, + { + "epoch": 1.5097493036211698, + "grad_norm": 1.6041408777236938, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8860288858413696, + "num_tokens": 296392868.0, + "step": 8130 + }, + { + "epoch": 1.5099350046425255, + "grad_norm": 1.5970728397369385, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8801784515380859, + "num_tokens": 296425828.0, + "step": 8131 + }, + { + "epoch": 1.5101207056638812, + "grad_norm": 1.5505595207214355, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8676180839538574, + "num_tokens": 296461840.0, + "step": 8132 + }, + { + "epoch": 1.5103064066852367, + "grad_norm": 1.7431203126907349, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.876714825630188, + "num_tokens": 296492090.0, + "step": 8133 + }, + { + "epoch": 1.5104921077065923, + "grad_norm": 1.4401648044586182, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8810020089149475, + "num_tokens": 296534379.0, + "step": 8134 + }, + { + "epoch": 1.510677808727948, + "grad_norm": 1.4497157335281372, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8819369673728943, + "num_tokens": 296573203.0, + "step": 8135 + }, + { + "epoch": 1.5108635097493037, + "grad_norm": 1.5859880447387695, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.865513265132904, + "num_tokens": 296607421.0, + "step": 8136 + }, + { + "epoch": 1.5110492107706592, + "grad_norm": 1.5264281034469604, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.891828179359436, + "num_tokens": 296644682.0, + "step": 8137 + }, + { + "epoch": 1.5112349117920147, + "grad_norm": 1.540903925895691, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.871143639087677, + "num_tokens": 296686256.0, + "step": 8138 + }, + { + "epoch": 1.5114206128133705, + "grad_norm": 1.4625558853149414, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8653092384338379, + "num_tokens": 296727871.0, + "step": 8139 + }, + { + "epoch": 1.5116063138347262, + "grad_norm": 1.580375075340271, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8777969479560852, + "num_tokens": 296764475.0, + "step": 8140 + }, + { + "epoch": 1.5117920148560817, + "grad_norm": 1.631468415260315, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8666731119155884, + "num_tokens": 296800928.0, + "step": 8141 + }, + { + "epoch": 1.5119777158774372, + "grad_norm": 1.6125580072402954, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8920905590057373, + "num_tokens": 296833612.0, + "step": 8142 + }, + { + "epoch": 1.512163416898793, + "grad_norm": 1.646974802017212, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8832523226737976, + "num_tokens": 296868881.0, + "step": 8143 + }, + { + "epoch": 1.5123491179201487, + "grad_norm": 1.5013123750686646, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8664777278900146, + "num_tokens": 296910028.0, + "step": 8144 + }, + { + "epoch": 1.5125348189415042, + "grad_norm": 1.5876693725585938, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8598036766052246, + "num_tokens": 296950533.0, + "step": 8145 + }, + { + "epoch": 1.5127205199628597, + "grad_norm": 1.516494631767273, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8647205829620361, + "num_tokens": 296989536.0, + "step": 8146 + }, + { + "epoch": 1.5129062209842155, + "grad_norm": 1.4829570055007935, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8822624683380127, + "num_tokens": 297025699.0, + "step": 8147 + }, + { + "epoch": 1.5130919220055712, + "grad_norm": 1.5053167343139648, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8733644485473633, + "num_tokens": 297060525.0, + "step": 8148 + }, + { + "epoch": 1.5132776230269267, + "grad_norm": 1.578705906867981, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8821515440940857, + "num_tokens": 297095620.0, + "step": 8149 + }, + { + "epoch": 1.5134633240482822, + "grad_norm": 1.734969139099121, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8627729415893555, + "num_tokens": 297131212.0, + "step": 8150 + }, + { + "epoch": 1.513649025069638, + "grad_norm": 1.5896165370941162, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8787122964859009, + "num_tokens": 297166938.0, + "step": 8151 + }, + { + "epoch": 1.5138347260909935, + "grad_norm": 1.5740904808044434, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8821831941604614, + "num_tokens": 297199931.0, + "step": 8152 + }, + { + "epoch": 1.514020427112349, + "grad_norm": 1.5420840978622437, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8782551884651184, + "num_tokens": 297235617.0, + "step": 8153 + }, + { + "epoch": 1.5142061281337047, + "grad_norm": 1.6260945796966553, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8764387369155884, + "num_tokens": 297266994.0, + "step": 8154 + }, + { + "epoch": 1.5143918291550604, + "grad_norm": 1.5379408597946167, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8872476816177368, + "num_tokens": 297302387.0, + "step": 8155 + }, + { + "epoch": 1.514577530176416, + "grad_norm": 1.4467206001281738, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8790254592895508, + "num_tokens": 297341907.0, + "step": 8156 + }, + { + "epoch": 1.5147632311977715, + "grad_norm": 1.5826375484466553, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8617734909057617, + "num_tokens": 297381445.0, + "step": 8157 + }, + { + "epoch": 1.5149489322191272, + "grad_norm": 1.5884618759155273, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.886975884437561, + "num_tokens": 297414134.0, + "step": 8158 + }, + { + "epoch": 1.515134633240483, + "grad_norm": 1.636622667312622, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8644374012947083, + "num_tokens": 297452817.0, + "step": 8159 + }, + { + "epoch": 1.5153203342618384, + "grad_norm": 1.667203664779663, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.876529335975647, + "num_tokens": 297486066.0, + "step": 8160 + }, + { + "epoch": 1.515506035283194, + "grad_norm": 1.4550285339355469, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8916498422622681, + "num_tokens": 297523716.0, + "step": 8161 + }, + { + "epoch": 1.5156917363045497, + "grad_norm": 1.7082897424697876, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8699051141738892, + "num_tokens": 297554051.0, + "step": 8162 + }, + { + "epoch": 1.5158774373259054, + "grad_norm": 1.4395442008972168, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8738109469413757, + "num_tokens": 297598916.0, + "step": 8163 + }, + { + "epoch": 1.516063138347261, + "grad_norm": 1.5106275081634521, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8713778257369995, + "num_tokens": 297638597.0, + "step": 8164 + }, + { + "epoch": 1.5162488393686164, + "grad_norm": 1.5219700336456299, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8890882134437561, + "num_tokens": 297672472.0, + "step": 8165 + }, + { + "epoch": 1.5164345403899722, + "grad_norm": 1.5639106035232544, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.886967658996582, + "num_tokens": 297708917.0, + "step": 8166 + }, + { + "epoch": 1.516620241411328, + "grad_norm": 1.4585413932800293, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.876834511756897, + "num_tokens": 297747521.0, + "step": 8167 + }, + { + "epoch": 1.5168059424326834, + "grad_norm": 1.5539661645889282, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8834271430969238, + "num_tokens": 297783468.0, + "step": 8168 + }, + { + "epoch": 1.516991643454039, + "grad_norm": 1.6136428117752075, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8874523639678955, + "num_tokens": 297814432.0, + "step": 8169 + }, + { + "epoch": 1.5171773444753947, + "grad_norm": 1.5327666997909546, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8755025863647461, + "num_tokens": 297854023.0, + "step": 8170 + }, + { + "epoch": 1.5173630454967504, + "grad_norm": 1.6407668590545654, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.859208345413208, + "num_tokens": 297892424.0, + "step": 8171 + }, + { + "epoch": 1.517548746518106, + "grad_norm": 1.469164252281189, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8971667289733887, + "num_tokens": 297925868.0, + "step": 8172 + }, + { + "epoch": 1.5177344475394614, + "grad_norm": 1.6019314527511597, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8825588226318359, + "num_tokens": 297967264.0, + "step": 8173 + }, + { + "epoch": 1.5179201485608171, + "grad_norm": 1.4633241891860962, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8937311172485352, + "num_tokens": 298005507.0, + "step": 8174 + }, + { + "epoch": 1.5181058495821727, + "grad_norm": 1.683995246887207, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8620651364326477, + "num_tokens": 298041163.0, + "step": 8175 + }, + { + "epoch": 1.5182915506035282, + "grad_norm": 1.4468863010406494, + "learning_rate": 1e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.8966687917709351, + "num_tokens": 298079670.0, + "step": 8176 + }, + { + "epoch": 1.518477251624884, + "grad_norm": 1.5811938047409058, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8744788765907288, + "num_tokens": 298115830.0, + "step": 8177 + }, + { + "epoch": 1.5186629526462396, + "grad_norm": 1.456320881843567, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8907017111778259, + "num_tokens": 298150862.0, + "step": 8178 + }, + { + "epoch": 1.5188486536675951, + "grad_norm": 1.4154436588287354, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8860591650009155, + "num_tokens": 298188307.0, + "step": 8179 + }, + { + "epoch": 1.5190343546889506, + "grad_norm": 1.5093417167663574, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8698749542236328, + "num_tokens": 298227805.0, + "step": 8180 + }, + { + "epoch": 1.5192200557103064, + "grad_norm": 1.653113603591919, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8622375130653381, + "num_tokens": 298263746.0, + "step": 8181 + }, + { + "epoch": 1.5194057567316621, + "grad_norm": 1.4869472980499268, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8702657222747803, + "num_tokens": 298302156.0, + "step": 8182 + }, + { + "epoch": 1.5195914577530176, + "grad_norm": 1.5373008251190186, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8652779459953308, + "num_tokens": 298338865.0, + "step": 8183 + }, + { + "epoch": 1.5197771587743731, + "grad_norm": 1.52000093460083, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8742455244064331, + "num_tokens": 298376836.0, + "step": 8184 + }, + { + "epoch": 1.5199628597957289, + "grad_norm": 1.454915165901184, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8726826310157776, + "num_tokens": 298416185.0, + "step": 8185 + }, + { + "epoch": 1.5201485608170846, + "grad_norm": 1.5680387020111084, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8833128213882446, + "num_tokens": 298453657.0, + "step": 8186 + }, + { + "epoch": 1.5203342618384401, + "grad_norm": 1.6810284852981567, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8751778602600098, + "num_tokens": 298486770.0, + "step": 8187 + }, + { + "epoch": 1.5205199628597956, + "grad_norm": 1.4122904539108276, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8889428377151489, + "num_tokens": 298525407.0, + "step": 8188 + }, + { + "epoch": 1.5207056638811514, + "grad_norm": 1.5571681261062622, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8908752202987671, + "num_tokens": 298558814.0, + "step": 8189 + }, + { + "epoch": 1.520891364902507, + "grad_norm": 1.5236140489578247, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8777512311935425, + "num_tokens": 298594454.0, + "step": 8190 + }, + { + "epoch": 1.5210770659238626, + "grad_norm": 1.5145500898361206, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8559541702270508, + "num_tokens": 298636704.0, + "step": 8191 + }, + { + "epoch": 1.5212627669452181, + "grad_norm": 1.633991003036499, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8602359294891357, + "num_tokens": 298674586.0, + "step": 8192 + }, + { + "epoch": 1.5214484679665738, + "grad_norm": 1.532089114189148, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8871734142303467, + "num_tokens": 298709654.0, + "step": 8193 + }, + { + "epoch": 1.5216341689879296, + "grad_norm": 1.5610240697860718, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8774569034576416, + "num_tokens": 298745288.0, + "step": 8194 + }, + { + "epoch": 1.521819870009285, + "grad_norm": 1.5524394512176514, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8891951441764832, + "num_tokens": 298777374.0, + "step": 8195 + }, + { + "epoch": 1.5220055710306406, + "grad_norm": 1.4633897542953491, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8891058564186096, + "num_tokens": 298813760.0, + "step": 8196 + }, + { + "epoch": 1.5221912720519963, + "grad_norm": 1.6250135898590088, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8755694627761841, + "num_tokens": 298843198.0, + "step": 8197 + }, + { + "epoch": 1.522376973073352, + "grad_norm": 1.587291955947876, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8662823438644409, + "num_tokens": 298884802.0, + "step": 8198 + }, + { + "epoch": 1.5225626740947074, + "grad_norm": 1.4588502645492554, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8859513998031616, + "num_tokens": 298920410.0, + "step": 8199 + }, + { + "epoch": 1.522748375116063, + "grad_norm": 1.5027801990509033, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8860097527503967, + "num_tokens": 298956436.0, + "step": 8200 + }, + { + "epoch": 1.5229340761374188, + "grad_norm": 1.7565871477127075, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8578986525535583, + "num_tokens": 298985662.0, + "step": 8201 + }, + { + "epoch": 1.5231197771587743, + "grad_norm": 1.5213183164596558, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8714923858642578, + "num_tokens": 299021613.0, + "step": 8202 + }, + { + "epoch": 1.5233054781801298, + "grad_norm": 1.6086790561676025, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8824871778488159, + "num_tokens": 299057407.0, + "step": 8203 + }, + { + "epoch": 1.5234911792014856, + "grad_norm": 1.6659307479858398, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.877515435218811, + "num_tokens": 299092223.0, + "step": 8204 + }, + { + "epoch": 1.5236768802228413, + "grad_norm": 1.3869644403457642, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8848666548728943, + "num_tokens": 299134286.0, + "step": 8205 + }, + { + "epoch": 1.5238625812441968, + "grad_norm": 1.5830475091934204, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8745166063308716, + "num_tokens": 299167625.0, + "step": 8206 + }, + { + "epoch": 1.5240482822655523, + "grad_norm": 1.3532111644744873, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8842083811759949, + "num_tokens": 299211669.0, + "step": 8207 + }, + { + "epoch": 1.524233983286908, + "grad_norm": 1.636931300163269, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8689290285110474, + "num_tokens": 299244904.0, + "step": 8208 + }, + { + "epoch": 1.5244196843082638, + "grad_norm": 1.557111144065857, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8885923624038696, + "num_tokens": 299279148.0, + "step": 8209 + }, + { + "epoch": 1.5246053853296193, + "grad_norm": 1.5636014938354492, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8765271306037903, + "num_tokens": 299316553.0, + "step": 8210 + }, + { + "epoch": 1.5247910863509748, + "grad_norm": 1.4583196640014648, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.868116557598114, + "num_tokens": 299359457.0, + "step": 8211 + }, + { + "epoch": 1.5249767873723306, + "grad_norm": 1.6214784383773804, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8746023178100586, + "num_tokens": 299394350.0, + "step": 8212 + }, + { + "epoch": 1.5251624883936863, + "grad_norm": 1.5330876111984253, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8699931502342224, + "num_tokens": 299432524.0, + "step": 8213 + }, + { + "epoch": 1.5253481894150418, + "grad_norm": 1.4731582403182983, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8763754367828369, + "num_tokens": 299472939.0, + "step": 8214 + }, + { + "epoch": 1.5255338904363973, + "grad_norm": 1.5644989013671875, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8776538372039795, + "num_tokens": 299509297.0, + "step": 8215 + }, + { + "epoch": 1.525719591457753, + "grad_norm": 1.5675913095474243, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8620157837867737, + "num_tokens": 299543737.0, + "step": 8216 + }, + { + "epoch": 1.5259052924791088, + "grad_norm": 1.689154863357544, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8758350014686584, + "num_tokens": 299577063.0, + "step": 8217 + }, + { + "epoch": 1.5260909935004643, + "grad_norm": 1.4363014698028564, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8772140741348267, + "num_tokens": 299616854.0, + "step": 8218 + }, + { + "epoch": 1.5262766945218198, + "grad_norm": 1.511788249015808, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8661430478096008, + "num_tokens": 299657515.0, + "step": 8219 + }, + { + "epoch": 1.5264623955431755, + "grad_norm": 1.554727554321289, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8692047595977783, + "num_tokens": 299696534.0, + "step": 8220 + }, + { + "epoch": 1.5266480965645313, + "grad_norm": 1.4744248390197754, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8708475828170776, + "num_tokens": 299737451.0, + "step": 8221 + }, + { + "epoch": 1.5268337975858868, + "grad_norm": 1.5025053024291992, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8705410957336426, + "num_tokens": 299779794.0, + "step": 8222 + }, + { + "epoch": 1.5270194986072423, + "grad_norm": 1.6150530576705933, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8726884126663208, + "num_tokens": 299817213.0, + "step": 8223 + }, + { + "epoch": 1.527205199628598, + "grad_norm": 1.5772384405136108, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8814870715141296, + "num_tokens": 299851849.0, + "step": 8224 + }, + { + "epoch": 1.5273909006499535, + "grad_norm": 1.590678334236145, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.863967776298523, + "num_tokens": 299894058.0, + "step": 8225 + }, + { + "epoch": 1.527576601671309, + "grad_norm": 1.4397897720336914, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.891097903251648, + "num_tokens": 299932955.0, + "step": 8226 + }, + { + "epoch": 1.5277623026926648, + "grad_norm": 1.634229063987732, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8868273496627808, + "num_tokens": 299964142.0, + "step": 8227 + }, + { + "epoch": 1.5279480037140205, + "grad_norm": 1.717262625694275, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8736684918403625, + "num_tokens": 299996788.0, + "step": 8228 + }, + { + "epoch": 1.528133704735376, + "grad_norm": 1.7119461297988892, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8664447069168091, + "num_tokens": 300029561.0, + "step": 8229 + }, + { + "epoch": 1.5283194057567315, + "grad_norm": 1.6401607990264893, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8822410106658936, + "num_tokens": 300060688.0, + "step": 8230 + }, + { + "epoch": 1.5285051067780873, + "grad_norm": 1.6300764083862305, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8536968231201172, + "num_tokens": 300095450.0, + "step": 8231 + }, + { + "epoch": 1.528690807799443, + "grad_norm": 1.581233024597168, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8828498721122742, + "num_tokens": 300131775.0, + "step": 8232 + }, + { + "epoch": 1.5288765088207985, + "grad_norm": 1.6986234188079834, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.86752849817276, + "num_tokens": 300166491.0, + "step": 8233 + }, + { + "epoch": 1.529062209842154, + "grad_norm": 1.6435881853103638, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8678802251815796, + "num_tokens": 300205606.0, + "step": 8234 + }, + { + "epoch": 1.5292479108635098, + "grad_norm": 1.5381367206573486, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8783557415008545, + "num_tokens": 300246334.0, + "step": 8235 + }, + { + "epoch": 1.5294336118848655, + "grad_norm": 1.5876030921936035, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8950419425964355, + "num_tokens": 300278544.0, + "step": 8236 + }, + { + "epoch": 1.529619312906221, + "grad_norm": 1.4802740812301636, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8835433125495911, + "num_tokens": 300319003.0, + "step": 8237 + }, + { + "epoch": 1.5298050139275765, + "grad_norm": 1.665245771408081, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.869044840335846, + "num_tokens": 300351606.0, + "step": 8238 + }, + { + "epoch": 1.5299907149489322, + "grad_norm": 1.6765278577804565, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8777353167533875, + "num_tokens": 300387133.0, + "step": 8239 + }, + { + "epoch": 1.530176415970288, + "grad_norm": 1.5297046899795532, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8640718460083008, + "num_tokens": 300432589.0, + "step": 8240 + }, + { + "epoch": 1.5303621169916435, + "grad_norm": 1.571578025817871, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8823714256286621, + "num_tokens": 300466649.0, + "step": 8241 + }, + { + "epoch": 1.530547818012999, + "grad_norm": 1.6623470783233643, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8670990467071533, + "num_tokens": 300503190.0, + "step": 8242 + }, + { + "epoch": 1.5307335190343547, + "grad_norm": 1.6428896188735962, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8734433650970459, + "num_tokens": 300542206.0, + "step": 8243 + }, + { + "epoch": 1.5309192200557105, + "grad_norm": 1.6876925230026245, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8723539113998413, + "num_tokens": 300575785.0, + "step": 8244 + }, + { + "epoch": 1.531104921077066, + "grad_norm": 1.4088268280029297, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.9021345376968384, + "num_tokens": 300614470.0, + "step": 8245 + }, + { + "epoch": 1.5312906220984215, + "grad_norm": 1.2876182794570923, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8922064900398254, + "num_tokens": 300660037.0, + "step": 8246 + }, + { + "epoch": 1.5314763231197772, + "grad_norm": 1.57538902759552, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8787891268730164, + "num_tokens": 300696310.0, + "step": 8247 + }, + { + "epoch": 1.5316620241411327, + "grad_norm": 1.4852367639541626, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.889957070350647, + "num_tokens": 300730444.0, + "step": 8248 + }, + { + "epoch": 1.5318477251624882, + "grad_norm": 1.5391230583190918, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8824132680892944, + "num_tokens": 300765938.0, + "step": 8249 + }, + { + "epoch": 1.532033426183844, + "grad_norm": 1.530949354171753, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8842601776123047, + "num_tokens": 300799365.0, + "step": 8250 + }, + { + "epoch": 1.5322191272051997, + "grad_norm": 1.538076639175415, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8797545433044434, + "num_tokens": 300833037.0, + "step": 8251 + }, + { + "epoch": 1.5324048282265552, + "grad_norm": 1.4789190292358398, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8713440299034119, + "num_tokens": 300873090.0, + "step": 8252 + }, + { + "epoch": 1.5325905292479107, + "grad_norm": 1.6866999864578247, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8852723836898804, + "num_tokens": 300907185.0, + "step": 8253 + }, + { + "epoch": 1.5327762302692665, + "grad_norm": 1.5347648859024048, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8689177632331848, + "num_tokens": 300947885.0, + "step": 8254 + }, + { + "epoch": 1.5329619312906222, + "grad_norm": 1.5668941736221313, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8846891522407532, + "num_tokens": 300987827.0, + "step": 8255 + }, + { + "epoch": 1.5331476323119777, + "grad_norm": 1.6647887229919434, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8645130395889282, + "num_tokens": 301024281.0, + "step": 8256 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 1.4970653057098389, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8732274770736694, + "num_tokens": 301062990.0, + "step": 8257 + }, + { + "epoch": 1.533519034354689, + "grad_norm": 1.499855875968933, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8737589120864868, + "num_tokens": 301103060.0, + "step": 8258 + }, + { + "epoch": 1.5337047353760447, + "grad_norm": 1.524839162826538, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8851040005683899, + "num_tokens": 301143861.0, + "step": 8259 + }, + { + "epoch": 1.5338904363974002, + "grad_norm": 1.5540015697479248, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.865323007106781, + "num_tokens": 301182481.0, + "step": 8260 + }, + { + "epoch": 1.5340761374187557, + "grad_norm": 1.450895071029663, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8808106780052185, + "num_tokens": 301226989.0, + "step": 8261 + }, + { + "epoch": 1.5342618384401114, + "grad_norm": 1.5968780517578125, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8857858180999756, + "num_tokens": 301258337.0, + "step": 8262 + }, + { + "epoch": 1.5344475394614672, + "grad_norm": 1.5035333633422852, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.887316107749939, + "num_tokens": 301296204.0, + "step": 8263 + }, + { + "epoch": 1.5346332404828227, + "grad_norm": 1.5169340372085571, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8763629794120789, + "num_tokens": 301332633.0, + "step": 8264 + }, + { + "epoch": 1.5348189415041782, + "grad_norm": 1.5647141933441162, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8552702069282532, + "num_tokens": 301370110.0, + "step": 8265 + }, + { + "epoch": 1.535004642525534, + "grad_norm": 1.4223947525024414, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8761127591133118, + "num_tokens": 301409996.0, + "step": 8266 + }, + { + "epoch": 1.5351903435468897, + "grad_norm": 1.5490586757659912, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.868187665939331, + "num_tokens": 301449502.0, + "step": 8267 + }, + { + "epoch": 1.5353760445682452, + "grad_norm": 1.5106642246246338, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.880251407623291, + "num_tokens": 301490110.0, + "step": 8268 + }, + { + "epoch": 1.5355617455896007, + "grad_norm": 1.7059122323989868, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8603789806365967, + "num_tokens": 301522815.0, + "step": 8269 + }, + { + "epoch": 1.5357474466109564, + "grad_norm": 1.7204853296279907, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8899038434028625, + "num_tokens": 301550222.0, + "step": 8270 + }, + { + "epoch": 1.535933147632312, + "grad_norm": 1.6326360702514648, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.86628258228302, + "num_tokens": 301585487.0, + "step": 8271 + }, + { + "epoch": 1.5361188486536674, + "grad_norm": 1.644405722618103, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8712445497512817, + "num_tokens": 301624036.0, + "step": 8272 + }, + { + "epoch": 1.5363045496750232, + "grad_norm": 1.4798063039779663, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8928042650222778, + "num_tokens": 301660499.0, + "step": 8273 + }, + { + "epoch": 1.536490250696379, + "grad_norm": 1.5929468870162964, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8872205018997192, + "num_tokens": 301693432.0, + "step": 8274 + }, + { + "epoch": 1.5366759517177344, + "grad_norm": 1.5545042753219604, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8837820887565613, + "num_tokens": 301728107.0, + "step": 8275 + }, + { + "epoch": 1.53686165273909, + "grad_norm": 1.3817888498306274, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8896290063858032, + "num_tokens": 301768097.0, + "step": 8276 + }, + { + "epoch": 1.5370473537604457, + "grad_norm": 1.5061501264572144, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.884573221206665, + "num_tokens": 301804872.0, + "step": 8277 + }, + { + "epoch": 1.5372330547818014, + "grad_norm": 1.9065158367156982, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8693995475769043, + "num_tokens": 301831259.0, + "step": 8278 + }, + { + "epoch": 1.537418755803157, + "grad_norm": 1.6593800783157349, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8839125037193298, + "num_tokens": 301862415.0, + "step": 8279 + }, + { + "epoch": 1.5376044568245124, + "grad_norm": 1.4419034719467163, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.880220890045166, + "num_tokens": 301902711.0, + "step": 8280 + }, + { + "epoch": 1.5377901578458681, + "grad_norm": 1.3831942081451416, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8897185325622559, + "num_tokens": 301945839.0, + "step": 8281 + }, + { + "epoch": 1.5379758588672239, + "grad_norm": 1.7485761642456055, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8824422359466553, + "num_tokens": 301977653.0, + "step": 8282 + }, + { + "epoch": 1.5381615598885794, + "grad_norm": 1.6088632345199585, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8840252161026001, + "num_tokens": 302009216.0, + "step": 8283 + }, + { + "epoch": 1.538347260909935, + "grad_norm": 1.529487133026123, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.885414719581604, + "num_tokens": 302043794.0, + "step": 8284 + }, + { + "epoch": 1.5385329619312906, + "grad_norm": 1.5619312524795532, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.892470121383667, + "num_tokens": 302074285.0, + "step": 8285 + }, + { + "epoch": 1.5387186629526464, + "grad_norm": 1.4995607137680054, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8778899908065796, + "num_tokens": 302111994.0, + "step": 8286 + }, + { + "epoch": 1.5389043639740019, + "grad_norm": 1.5164368152618408, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8864378929138184, + "num_tokens": 302146107.0, + "step": 8287 + }, + { + "epoch": 1.5390900649953574, + "grad_norm": 1.4658161401748657, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8707771301269531, + "num_tokens": 302185818.0, + "step": 8288 + }, + { + "epoch": 1.5392757660167131, + "grad_norm": 1.6902538537979126, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8799788355827332, + "num_tokens": 302216083.0, + "step": 8289 + }, + { + "epoch": 1.5394614670380689, + "grad_norm": 1.6097017526626587, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8788502216339111, + "num_tokens": 302250951.0, + "step": 8290 + }, + { + "epoch": 1.5396471680594244, + "grad_norm": 1.589290738105774, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8650075197219849, + "num_tokens": 302289262.0, + "step": 8291 + }, + { + "epoch": 1.5398328690807799, + "grad_norm": 1.5901283025741577, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8867781162261963, + "num_tokens": 302320606.0, + "step": 8292 + }, + { + "epoch": 1.5400185701021356, + "grad_norm": 1.500712513923645, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.87827467918396, + "num_tokens": 302355763.0, + "step": 8293 + }, + { + "epoch": 1.5402042711234913, + "grad_norm": 1.5497534275054932, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8537999391555786, + "num_tokens": 302395750.0, + "step": 8294 + }, + { + "epoch": 1.5403899721448466, + "grad_norm": 1.6901037693023682, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8843117356300354, + "num_tokens": 302429026.0, + "step": 8295 + }, + { + "epoch": 1.5405756731662024, + "grad_norm": 1.4463231563568115, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8902619481086731, + "num_tokens": 302465497.0, + "step": 8296 + }, + { + "epoch": 1.540761374187558, + "grad_norm": 1.6588249206542969, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8720663785934448, + "num_tokens": 302500976.0, + "step": 8297 + }, + { + "epoch": 1.5409470752089136, + "grad_norm": 1.6776556968688965, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8618809580802917, + "num_tokens": 302534526.0, + "step": 8298 + }, + { + "epoch": 1.5411327762302691, + "grad_norm": 1.4702112674713135, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8634599447250366, + "num_tokens": 302575946.0, + "step": 8299 + }, + { + "epoch": 1.5413184772516249, + "grad_norm": 1.4560644626617432, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8819109201431274, + "num_tokens": 302615234.0, + "step": 8300 + }, + { + "epoch": 1.5415041782729806, + "grad_norm": 1.4268511533737183, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8805727958679199, + "num_tokens": 302655729.0, + "step": 8301 + }, + { + "epoch": 1.541689879294336, + "grad_norm": 1.627524733543396, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8710353374481201, + "num_tokens": 302690507.0, + "step": 8302 + }, + { + "epoch": 1.5418755803156916, + "grad_norm": 1.5524173974990845, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8701399564743042, + "num_tokens": 302726560.0, + "step": 8303 + }, + { + "epoch": 1.5420612813370473, + "grad_norm": 1.5514671802520752, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8684776425361633, + "num_tokens": 302766087.0, + "step": 8304 + }, + { + "epoch": 1.542246982358403, + "grad_norm": 1.492188811302185, + "learning_rate": 1e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.8940245509147644, + "num_tokens": 302799806.0, + "step": 8305 + }, + { + "epoch": 1.5424326833797586, + "grad_norm": 1.4771310091018677, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8765393495559692, + "num_tokens": 302839965.0, + "step": 8306 + }, + { + "epoch": 1.542618384401114, + "grad_norm": 1.649705171585083, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8870145082473755, + "num_tokens": 302873105.0, + "step": 8307 + }, + { + "epoch": 1.5428040854224698, + "grad_norm": 1.5183128118515015, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.89082270860672, + "num_tokens": 302908797.0, + "step": 8308 + }, + { + "epoch": 1.5429897864438256, + "grad_norm": 1.5662705898284912, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8797491192817688, + "num_tokens": 302943428.0, + "step": 8309 + }, + { + "epoch": 1.543175487465181, + "grad_norm": 1.5374058485031128, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8855526447296143, + "num_tokens": 302980804.0, + "step": 8310 + }, + { + "epoch": 1.5433611884865366, + "grad_norm": 1.6609035730361938, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8814009428024292, + "num_tokens": 303012079.0, + "step": 8311 + }, + { + "epoch": 1.5435468895078923, + "grad_norm": 1.6006016731262207, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8797523379325867, + "num_tokens": 303048007.0, + "step": 8312 + }, + { + "epoch": 1.543732590529248, + "grad_norm": 1.4528743028640747, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.884614109992981, + "num_tokens": 303090400.0, + "step": 8313 + }, + { + "epoch": 1.5439182915506036, + "grad_norm": 1.6056259870529175, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8779933452606201, + "num_tokens": 303122102.0, + "step": 8314 + }, + { + "epoch": 1.544103992571959, + "grad_norm": 1.524577260017395, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8801721334457397, + "num_tokens": 303158081.0, + "step": 8315 + }, + { + "epoch": 1.5442896935933148, + "grad_norm": 1.504037618637085, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8750823736190796, + "num_tokens": 303199937.0, + "step": 8316 + }, + { + "epoch": 1.5444753946146705, + "grad_norm": 1.5570662021636963, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8727244138717651, + "num_tokens": 303239497.0, + "step": 8317 + }, + { + "epoch": 1.544661095636026, + "grad_norm": 1.4565106630325317, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8928920030593872, + "num_tokens": 303273883.0, + "step": 8318 + }, + { + "epoch": 1.5448467966573816, + "grad_norm": 1.4578793048858643, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8685779571533203, + "num_tokens": 303314670.0, + "step": 8319 + }, + { + "epoch": 1.5450324976787373, + "grad_norm": 1.5673408508300781, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8688439130783081, + "num_tokens": 303349084.0, + "step": 8320 + }, + { + "epoch": 1.5452181987000928, + "grad_norm": 1.4952439069747925, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8752164840698242, + "num_tokens": 303389740.0, + "step": 8321 + }, + { + "epoch": 1.5454038997214483, + "grad_norm": 1.6085009574890137, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8853751420974731, + "num_tokens": 303425126.0, + "step": 8322 + }, + { + "epoch": 1.545589600742804, + "grad_norm": 1.4647221565246582, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8768192529678345, + "num_tokens": 303467900.0, + "step": 8323 + }, + { + "epoch": 1.5457753017641598, + "grad_norm": 1.5611337423324585, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8661335110664368, + "num_tokens": 303504469.0, + "step": 8324 + }, + { + "epoch": 1.5459610027855153, + "grad_norm": 1.6901205778121948, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8701627254486084, + "num_tokens": 303542912.0, + "step": 8325 + }, + { + "epoch": 1.5461467038068708, + "grad_norm": 1.47969388961792, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8898416757583618, + "num_tokens": 303579926.0, + "step": 8326 + }, + { + "epoch": 1.5463324048282265, + "grad_norm": 1.5469223260879517, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8653137683868408, + "num_tokens": 303615557.0, + "step": 8327 + }, + { + "epoch": 1.5465181058495823, + "grad_norm": 1.6936368942260742, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8759229183197021, + "num_tokens": 303648907.0, + "step": 8328 + }, + { + "epoch": 1.5467038068709378, + "grad_norm": 1.539644479751587, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8844419717788696, + "num_tokens": 303685507.0, + "step": 8329 + }, + { + "epoch": 1.5468895078922933, + "grad_norm": 1.4722187519073486, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8875882625579834, + "num_tokens": 303725989.0, + "step": 8330 + }, + { + "epoch": 1.547075208913649, + "grad_norm": 1.5274488925933838, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8919821977615356, + "num_tokens": 303764084.0, + "step": 8331 + }, + { + "epoch": 1.5472609099350048, + "grad_norm": 1.5910364389419556, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8669586777687073, + "num_tokens": 303802057.0, + "step": 8332 + }, + { + "epoch": 1.5474466109563603, + "grad_norm": 1.4735490083694458, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8923177719116211, + "num_tokens": 303838707.0, + "step": 8333 + }, + { + "epoch": 1.5476323119777158, + "grad_norm": 1.5960471630096436, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8666398525238037, + "num_tokens": 303873146.0, + "step": 8334 + }, + { + "epoch": 1.5478180129990715, + "grad_norm": 1.5612916946411133, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8644026517868042, + "num_tokens": 303909567.0, + "step": 8335 + }, + { + "epoch": 1.5480037140204272, + "grad_norm": 1.5347648859024048, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8838289976119995, + "num_tokens": 303946205.0, + "step": 8336 + }, + { + "epoch": 1.5481894150417828, + "grad_norm": 1.4962202310562134, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.88193279504776, + "num_tokens": 303981858.0, + "step": 8337 + }, + { + "epoch": 1.5483751160631383, + "grad_norm": 1.596323847770691, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8759618997573853, + "num_tokens": 304015106.0, + "step": 8338 + }, + { + "epoch": 1.548560817084494, + "grad_norm": 1.7323400974273682, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8854069709777832, + "num_tokens": 304045851.0, + "step": 8339 + }, + { + "epoch": 1.5487465181058497, + "grad_norm": 1.7440921068191528, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8837277889251709, + "num_tokens": 304076038.0, + "step": 8340 + }, + { + "epoch": 1.5489322191272052, + "grad_norm": 1.6011651754379272, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.876222550868988, + "num_tokens": 304112476.0, + "step": 8341 + }, + { + "epoch": 1.5491179201485608, + "grad_norm": 1.5121327638626099, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8845244646072388, + "num_tokens": 304148592.0, + "step": 8342 + }, + { + "epoch": 1.5493036211699165, + "grad_norm": 1.4481385946273804, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8899844884872437, + "num_tokens": 304186334.0, + "step": 8343 + }, + { + "epoch": 1.549489322191272, + "grad_norm": 1.6576449871063232, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8745273351669312, + "num_tokens": 304218854.0, + "step": 8344 + }, + { + "epoch": 1.5496750232126275, + "grad_norm": 1.566876769065857, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8924242258071899, + "num_tokens": 304255017.0, + "step": 8345 + }, + { + "epoch": 1.5498607242339832, + "grad_norm": 1.5520237684249878, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8897879719734192, + "num_tokens": 304292860.0, + "step": 8346 + }, + { + "epoch": 1.550046425255339, + "grad_norm": 1.5786439180374146, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8960373401641846, + "num_tokens": 304328575.0, + "step": 8347 + }, + { + "epoch": 1.5502321262766945, + "grad_norm": 1.7500911951065063, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8730818033218384, + "num_tokens": 304357830.0, + "step": 8348 + }, + { + "epoch": 1.55041782729805, + "grad_norm": 1.5096266269683838, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8963031768798828, + "num_tokens": 304390986.0, + "step": 8349 + }, + { + "epoch": 1.5506035283194057, + "grad_norm": 1.6051743030548096, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8846024870872498, + "num_tokens": 304426650.0, + "step": 8350 + }, + { + "epoch": 1.5507892293407615, + "grad_norm": 1.5991847515106201, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8739101886749268, + "num_tokens": 304465329.0, + "step": 8351 + }, + { + "epoch": 1.550974930362117, + "grad_norm": 1.528818130493164, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.890005350112915, + "num_tokens": 304499760.0, + "step": 8352 + }, + { + "epoch": 1.5511606313834725, + "grad_norm": 1.6941440105438232, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8727392554283142, + "num_tokens": 304532055.0, + "step": 8353 + }, + { + "epoch": 1.5513463324048282, + "grad_norm": 1.5514886379241943, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8774248361587524, + "num_tokens": 304568921.0, + "step": 8354 + }, + { + "epoch": 1.551532033426184, + "grad_norm": 1.510254144668579, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8832933902740479, + "num_tokens": 304605434.0, + "step": 8355 + }, + { + "epoch": 1.5517177344475395, + "grad_norm": 1.7218097448349, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8558269739151001, + "num_tokens": 304640279.0, + "step": 8356 + }, + { + "epoch": 1.551903435468895, + "grad_norm": 1.65204656124115, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.878233790397644, + "num_tokens": 304673630.0, + "step": 8357 + }, + { + "epoch": 1.5520891364902507, + "grad_norm": 1.523070216178894, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.873659074306488, + "num_tokens": 304713517.0, + "step": 8358 + }, + { + "epoch": 1.5522748375116064, + "grad_norm": 1.5675599575042725, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8796601891517639, + "num_tokens": 304748577.0, + "step": 8359 + }, + { + "epoch": 1.552460538532962, + "grad_norm": 1.4165440797805786, + "learning_rate": 1e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.9036052227020264, + "num_tokens": 304784607.0, + "step": 8360 + }, + { + "epoch": 1.5526462395543175, + "grad_norm": 1.501883625984192, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8562159538269043, + "num_tokens": 304824805.0, + "step": 8361 + }, + { + "epoch": 1.5528319405756732, + "grad_norm": 1.4727694988250732, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8738372325897217, + "num_tokens": 304867223.0, + "step": 8362 + }, + { + "epoch": 1.553017641597029, + "grad_norm": 1.5633025169372559, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8753982782363892, + "num_tokens": 304904347.0, + "step": 8363 + }, + { + "epoch": 1.5532033426183844, + "grad_norm": 1.5250035524368286, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8803902268409729, + "num_tokens": 304944931.0, + "step": 8364 + }, + { + "epoch": 1.55338904363974, + "grad_norm": 1.6334863901138306, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8802343606948853, + "num_tokens": 304977486.0, + "step": 8365 + }, + { + "epoch": 1.5535747446610957, + "grad_norm": 1.4911514520645142, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8907642364501953, + "num_tokens": 305014572.0, + "step": 8366 + }, + { + "epoch": 1.5537604456824514, + "grad_norm": 1.4365276098251343, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8858708143234253, + "num_tokens": 305051391.0, + "step": 8367 + }, + { + "epoch": 1.5539461467038067, + "grad_norm": 1.5680180788040161, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8845557570457458, + "num_tokens": 305089493.0, + "step": 8368 + }, + { + "epoch": 1.5541318477251624, + "grad_norm": 1.4551633596420288, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8808760643005371, + "num_tokens": 305127815.0, + "step": 8369 + }, + { + "epoch": 1.5543175487465182, + "grad_norm": 1.6319535970687866, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8763740062713623, + "num_tokens": 305160297.0, + "step": 8370 + }, + { + "epoch": 1.5545032497678737, + "grad_norm": 1.556851863861084, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8818410038948059, + "num_tokens": 305197847.0, + "step": 8371 + }, + { + "epoch": 1.5546889507892292, + "grad_norm": 1.530591607093811, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8876322507858276, + "num_tokens": 305233468.0, + "step": 8372 + }, + { + "epoch": 1.554874651810585, + "grad_norm": 1.4623209238052368, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8700531721115112, + "num_tokens": 305275687.0, + "step": 8373 + }, + { + "epoch": 1.5550603528319407, + "grad_norm": 1.7376168966293335, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8561758399009705, + "num_tokens": 305308789.0, + "step": 8374 + }, + { + "epoch": 1.5552460538532962, + "grad_norm": 1.4350253343582153, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8679642081260681, + "num_tokens": 305352154.0, + "step": 8375 + }, + { + "epoch": 1.5554317548746517, + "grad_norm": 1.461323857307434, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8789079785346985, + "num_tokens": 305391613.0, + "step": 8376 + }, + { + "epoch": 1.5556174558960074, + "grad_norm": 1.4874190092086792, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8839824795722961, + "num_tokens": 305430428.0, + "step": 8377 + }, + { + "epoch": 1.5558031569173632, + "grad_norm": 1.5718683004379272, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8740774393081665, + "num_tokens": 305468829.0, + "step": 8378 + }, + { + "epoch": 1.5559888579387187, + "grad_norm": 1.7703323364257812, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8798516988754272, + "num_tokens": 305498385.0, + "step": 8379 + }, + { + "epoch": 1.5561745589600742, + "grad_norm": 1.6000237464904785, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8860504031181335, + "num_tokens": 305532863.0, + "step": 8380 + }, + { + "epoch": 1.55636025998143, + "grad_norm": 1.4215713739395142, + "learning_rate": 1e-06, + "loss": 0.2783, + "mean_token_accuracy": 0.9005975127220154, + "num_tokens": 305570067.0, + "step": 8381 + }, + { + "epoch": 1.5565459610027856, + "grad_norm": 1.5937494039535522, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8811988830566406, + "num_tokens": 305607806.0, + "step": 8382 + }, + { + "epoch": 1.5567316620241411, + "grad_norm": 1.6478102207183838, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8739578723907471, + "num_tokens": 305646809.0, + "step": 8383 + }, + { + "epoch": 1.5569173630454967, + "grad_norm": 1.6023869514465332, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8664973974227905, + "num_tokens": 305686409.0, + "step": 8384 + }, + { + "epoch": 1.5571030640668524, + "grad_norm": 1.5258060693740845, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8851943612098694, + "num_tokens": 305723833.0, + "step": 8385 + }, + { + "epoch": 1.5572887650882081, + "grad_norm": 1.5759867429733276, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8801796436309814, + "num_tokens": 305758263.0, + "step": 8386 + }, + { + "epoch": 1.5574744661095636, + "grad_norm": 1.4654910564422607, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8782147765159607, + "num_tokens": 305798031.0, + "step": 8387 + }, + { + "epoch": 1.5576601671309191, + "grad_norm": 1.8191769123077393, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8783584833145142, + "num_tokens": 305831517.0, + "step": 8388 + }, + { + "epoch": 1.5578458681522749, + "grad_norm": 1.6209527254104614, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.889451265335083, + "num_tokens": 305862815.0, + "step": 8389 + }, + { + "epoch": 1.5580315691736306, + "grad_norm": 1.2782193422317505, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.8969204425811768, + "num_tokens": 305902770.0, + "step": 8390 + }, + { + "epoch": 1.5582172701949861, + "grad_norm": 1.561232566833496, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8684051036834717, + "num_tokens": 305938756.0, + "step": 8391 + }, + { + "epoch": 1.5584029712163416, + "grad_norm": 1.3525129556655884, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8832274079322815, + "num_tokens": 305982786.0, + "step": 8392 + }, + { + "epoch": 1.5585886722376974, + "grad_norm": 1.4733467102050781, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8743575811386108, + "num_tokens": 306020402.0, + "step": 8393 + }, + { + "epoch": 1.5587743732590529, + "grad_norm": 1.654492974281311, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8781049847602844, + "num_tokens": 306054639.0, + "step": 8394 + }, + { + "epoch": 1.5589600742804084, + "grad_norm": 1.5228421688079834, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8733265399932861, + "num_tokens": 306094399.0, + "step": 8395 + }, + { + "epoch": 1.5591457753017641, + "grad_norm": 1.5453537702560425, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8716229200363159, + "num_tokens": 306133054.0, + "step": 8396 + }, + { + "epoch": 1.5593314763231199, + "grad_norm": 1.501009464263916, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8860000371932983, + "num_tokens": 306171276.0, + "step": 8397 + }, + { + "epoch": 1.5595171773444754, + "grad_norm": 1.6419320106506348, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.8980726003646851, + "num_tokens": 306201005.0, + "step": 8398 + }, + { + "epoch": 1.5597028783658309, + "grad_norm": 1.4870001077651978, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8818573355674744, + "num_tokens": 306241222.0, + "step": 8399 + }, + { + "epoch": 1.5598885793871866, + "grad_norm": 1.4615198373794556, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8738168478012085, + "num_tokens": 306285941.0, + "step": 8400 + }, + { + "epoch": 1.5600742804085423, + "grad_norm": 1.6777729988098145, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8737808465957642, + "num_tokens": 306315363.0, + "step": 8401 + }, + { + "epoch": 1.5602599814298979, + "grad_norm": 1.388726830482483, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.880174458026886, + "num_tokens": 306358500.0, + "step": 8402 + }, + { + "epoch": 1.5604456824512534, + "grad_norm": 1.4864985942840576, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8806139230728149, + "num_tokens": 306396265.0, + "step": 8403 + }, + { + "epoch": 1.560631383472609, + "grad_norm": 1.5088289976119995, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8815101385116577, + "num_tokens": 306434010.0, + "step": 8404 + }, + { + "epoch": 1.5608170844939648, + "grad_norm": 1.5818617343902588, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8791295289993286, + "num_tokens": 306469660.0, + "step": 8405 + }, + { + "epoch": 1.5610027855153203, + "grad_norm": 1.5883384943008423, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8722434639930725, + "num_tokens": 306502581.0, + "step": 8406 + }, + { + "epoch": 1.5611884865366759, + "grad_norm": 1.6845002174377441, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8702619075775146, + "num_tokens": 306533404.0, + "step": 8407 + }, + { + "epoch": 1.5613741875580316, + "grad_norm": 1.6298704147338867, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8785617351531982, + "num_tokens": 306567447.0, + "step": 8408 + }, + { + "epoch": 1.5615598885793873, + "grad_norm": 1.530011773109436, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8838704228401184, + "num_tokens": 306603662.0, + "step": 8409 + }, + { + "epoch": 1.5617455896007428, + "grad_norm": 1.5314949750900269, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8780965805053711, + "num_tokens": 306641574.0, + "step": 8410 + }, + { + "epoch": 1.5619312906220983, + "grad_norm": 1.5148215293884277, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.87628173828125, + "num_tokens": 306680038.0, + "step": 8411 + }, + { + "epoch": 1.562116991643454, + "grad_norm": 1.6610066890716553, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.875605583190918, + "num_tokens": 306712587.0, + "step": 8412 + }, + { + "epoch": 1.5623026926648098, + "grad_norm": 1.5140407085418701, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.893950343132019, + "num_tokens": 306743743.0, + "step": 8413 + }, + { + "epoch": 1.5624883936861653, + "grad_norm": 1.4487316608428955, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8732205629348755, + "num_tokens": 306784070.0, + "step": 8414 + }, + { + "epoch": 1.5626740947075208, + "grad_norm": 1.5368828773498535, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8835334181785583, + "num_tokens": 306817161.0, + "step": 8415 + }, + { + "epoch": 1.5628597957288766, + "grad_norm": 1.527372121810913, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8827576637268066, + "num_tokens": 306853671.0, + "step": 8416 + }, + { + "epoch": 1.563045496750232, + "grad_norm": 1.5613292455673218, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8832811713218689, + "num_tokens": 306889935.0, + "step": 8417 + }, + { + "epoch": 1.5632311977715876, + "grad_norm": 1.5616075992584229, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8731863498687744, + "num_tokens": 306927638.0, + "step": 8418 + }, + { + "epoch": 1.5634168987929433, + "grad_norm": 1.4139517545700073, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8923117518424988, + "num_tokens": 306968027.0, + "step": 8419 + }, + { + "epoch": 1.563602599814299, + "grad_norm": 1.7213455438613892, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8630445599555969, + "num_tokens": 307001121.0, + "step": 8420 + }, + { + "epoch": 1.5637883008356546, + "grad_norm": 1.5575839281082153, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8758998513221741, + "num_tokens": 307040005.0, + "step": 8421 + }, + { + "epoch": 1.56397400185701, + "grad_norm": 1.6555098295211792, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8683227896690369, + "num_tokens": 307072798.0, + "step": 8422 + }, + { + "epoch": 1.5641597028783658, + "grad_norm": 1.6063201427459717, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8596546053886414, + "num_tokens": 307111096.0, + "step": 8423 + }, + { + "epoch": 1.5643454038997215, + "grad_norm": 1.4961628913879395, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8829048871994019, + "num_tokens": 307146180.0, + "step": 8424 + }, + { + "epoch": 1.564531104921077, + "grad_norm": 1.6719989776611328, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8772411346435547, + "num_tokens": 307178582.0, + "step": 8425 + }, + { + "epoch": 1.5647168059424326, + "grad_norm": 1.5572737455368042, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8851221799850464, + "num_tokens": 307212680.0, + "step": 8426 + }, + { + "epoch": 1.5649025069637883, + "grad_norm": 1.6447978019714355, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8838975429534912, + "num_tokens": 307251416.0, + "step": 8427 + }, + { + "epoch": 1.565088207985144, + "grad_norm": 1.5722618103027344, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8814191818237305, + "num_tokens": 307285309.0, + "step": 8428 + }, + { + "epoch": 1.5652739090064995, + "grad_norm": 1.512816071510315, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8820891380310059, + "num_tokens": 307319609.0, + "step": 8429 + }, + { + "epoch": 1.565459610027855, + "grad_norm": 1.5183219909667969, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8723645210266113, + "num_tokens": 307359699.0, + "step": 8430 + }, + { + "epoch": 1.5656453110492108, + "grad_norm": 1.4344401359558105, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8837977647781372, + "num_tokens": 307396780.0, + "step": 8431 + }, + { + "epoch": 1.5658310120705665, + "grad_norm": 1.343587040901184, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8946464657783508, + "num_tokens": 307438893.0, + "step": 8432 + }, + { + "epoch": 1.566016713091922, + "grad_norm": 1.4956873655319214, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8845642805099487, + "num_tokens": 307473958.0, + "step": 8433 + }, + { + "epoch": 1.5662024141132775, + "grad_norm": 1.49971604347229, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8687011003494263, + "num_tokens": 307514164.0, + "step": 8434 + }, + { + "epoch": 1.5663881151346333, + "grad_norm": 1.5931743383407593, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8552875518798828, + "num_tokens": 307553407.0, + "step": 8435 + }, + { + "epoch": 1.566573816155989, + "grad_norm": 1.4535348415374756, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.875959038734436, + "num_tokens": 307594074.0, + "step": 8436 + }, + { + "epoch": 1.5667595171773445, + "grad_norm": 1.5292882919311523, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8876968026161194, + "num_tokens": 307629034.0, + "step": 8437 + }, + { + "epoch": 1.5669452181987, + "grad_norm": 1.519838809967041, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8798802495002747, + "num_tokens": 307667674.0, + "step": 8438 + }, + { + "epoch": 1.5671309192200558, + "grad_norm": 1.4988380670547485, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8764376640319824, + "num_tokens": 307707656.0, + "step": 8439 + }, + { + "epoch": 1.5673166202414113, + "grad_norm": 1.4561015367507935, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8865227699279785, + "num_tokens": 307746727.0, + "step": 8440 + }, + { + "epoch": 1.5675023212627668, + "grad_norm": 1.7509335279464722, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8649562001228333, + "num_tokens": 307780493.0, + "step": 8441 + }, + { + "epoch": 1.5676880222841225, + "grad_norm": 1.5396099090576172, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8746120929718018, + "num_tokens": 307821111.0, + "step": 8442 + }, + { + "epoch": 1.5678737233054783, + "grad_norm": 1.6914918422698975, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8684765100479126, + "num_tokens": 307852812.0, + "step": 8443 + }, + { + "epoch": 1.5680594243268338, + "grad_norm": 1.4688271284103394, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.873810887336731, + "num_tokens": 307891431.0, + "step": 8444 + }, + { + "epoch": 1.5682451253481893, + "grad_norm": 1.8070987462997437, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8470068573951721, + "num_tokens": 307926951.0, + "step": 8445 + }, + { + "epoch": 1.568430826369545, + "grad_norm": 1.6903578042984009, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.881984531879425, + "num_tokens": 307959419.0, + "step": 8446 + }, + { + "epoch": 1.5686165273909007, + "grad_norm": 1.6331148147583008, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8808350563049316, + "num_tokens": 307991653.0, + "step": 8447 + }, + { + "epoch": 1.5688022284122562, + "grad_norm": 1.5760127305984497, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8780625462532043, + "num_tokens": 308029893.0, + "step": 8448 + }, + { + "epoch": 1.5689879294336118, + "grad_norm": 1.6689088344573975, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8769959211349487, + "num_tokens": 308063826.0, + "step": 8449 + }, + { + "epoch": 1.5691736304549675, + "grad_norm": 1.6784467697143555, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8832874298095703, + "num_tokens": 308098431.0, + "step": 8450 + }, + { + "epoch": 1.5693593314763232, + "grad_norm": 1.6868637800216675, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.871688187122345, + "num_tokens": 308131334.0, + "step": 8451 + }, + { + "epoch": 1.5695450324976787, + "grad_norm": 1.6966763734817505, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8557701110839844, + "num_tokens": 308165803.0, + "step": 8452 + }, + { + "epoch": 1.5697307335190342, + "grad_norm": 1.4950326681137085, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8693006038665771, + "num_tokens": 308205184.0, + "step": 8453 + }, + { + "epoch": 1.56991643454039, + "grad_norm": 1.4473648071289062, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8769403100013733, + "num_tokens": 308244555.0, + "step": 8454 + }, + { + "epoch": 1.5701021355617457, + "grad_norm": 1.3618186712265015, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8835222721099854, + "num_tokens": 308288125.0, + "step": 8455 + }, + { + "epoch": 1.5702878365831012, + "grad_norm": 1.6572413444519043, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8666744232177734, + "num_tokens": 308319808.0, + "step": 8456 + }, + { + "epoch": 1.5704735376044567, + "grad_norm": 1.4958851337432861, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.876201331615448, + "num_tokens": 308358879.0, + "step": 8457 + }, + { + "epoch": 1.5706592386258125, + "grad_norm": 1.3969448804855347, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.887606143951416, + "num_tokens": 308397501.0, + "step": 8458 + }, + { + "epoch": 1.5708449396471682, + "grad_norm": 1.6571580171585083, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8844497203826904, + "num_tokens": 308428733.0, + "step": 8459 + }, + { + "epoch": 1.5710306406685237, + "grad_norm": 1.4541109800338745, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8814237117767334, + "num_tokens": 308465955.0, + "step": 8460 + }, + { + "epoch": 1.5712163416898792, + "grad_norm": 1.5031226873397827, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8621092438697815, + "num_tokens": 308507513.0, + "step": 8461 + }, + { + "epoch": 1.571402042711235, + "grad_norm": 1.509381890296936, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8834306001663208, + "num_tokens": 308546754.0, + "step": 8462 + }, + { + "epoch": 1.5715877437325907, + "grad_norm": 1.6216762065887451, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8794608116149902, + "num_tokens": 308578507.0, + "step": 8463 + }, + { + "epoch": 1.571773444753946, + "grad_norm": 1.6590931415557861, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8598025441169739, + "num_tokens": 308612182.0, + "step": 8464 + }, + { + "epoch": 1.5719591457753017, + "grad_norm": 1.5834040641784668, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.869235634803772, + "num_tokens": 308650532.0, + "step": 8465 + }, + { + "epoch": 1.5721448467966574, + "grad_norm": 1.5664671659469604, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.876681387424469, + "num_tokens": 308684936.0, + "step": 8466 + }, + { + "epoch": 1.572330547818013, + "grad_norm": 1.5777477025985718, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8699515461921692, + "num_tokens": 308721358.0, + "step": 8467 + }, + { + "epoch": 1.5725162488393685, + "grad_norm": 1.6659934520721436, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8681893944740295, + "num_tokens": 308756272.0, + "step": 8468 + }, + { + "epoch": 1.5727019498607242, + "grad_norm": 1.6551786661148071, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8757015466690063, + "num_tokens": 308791961.0, + "step": 8469 + }, + { + "epoch": 1.57288765088208, + "grad_norm": 1.6163859367370605, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8813964128494263, + "num_tokens": 308826371.0, + "step": 8470 + }, + { + "epoch": 1.5730733519034354, + "grad_norm": 1.4437159299850464, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8817633390426636, + "num_tokens": 308863650.0, + "step": 8471 + }, + { + "epoch": 1.573259052924791, + "grad_norm": 1.3619884252548218, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8749857544898987, + "num_tokens": 308909930.0, + "step": 8472 + }, + { + "epoch": 1.5734447539461467, + "grad_norm": 1.6838313341140747, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8729454874992371, + "num_tokens": 308945213.0, + "step": 8473 + }, + { + "epoch": 1.5736304549675024, + "grad_norm": 1.6478643417358398, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8804160356521606, + "num_tokens": 308978657.0, + "step": 8474 + }, + { + "epoch": 1.573816155988858, + "grad_norm": 1.5520780086517334, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.877874493598938, + "num_tokens": 309013322.0, + "step": 8475 + }, + { + "epoch": 1.5740018570102134, + "grad_norm": 1.5966554880142212, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8782286643981934, + "num_tokens": 309048728.0, + "step": 8476 + }, + { + "epoch": 1.5741875580315692, + "grad_norm": 1.5546103715896606, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8869426250457764, + "num_tokens": 309082968.0, + "step": 8477 + }, + { + "epoch": 1.574373259052925, + "grad_norm": 1.6900049448013306, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8658746480941772, + "num_tokens": 309116791.0, + "step": 8478 + }, + { + "epoch": 1.5745589600742804, + "grad_norm": 1.5780011415481567, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8703113794326782, + "num_tokens": 309156782.0, + "step": 8479 + }, + { + "epoch": 1.574744661095636, + "grad_norm": 1.5866315364837646, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8797026872634888, + "num_tokens": 309193186.0, + "step": 8480 + }, + { + "epoch": 1.5749303621169917, + "grad_norm": 1.4862208366394043, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8676106333732605, + "num_tokens": 309231865.0, + "step": 8481 + }, + { + "epoch": 1.5751160631383474, + "grad_norm": 1.3295855522155762, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.877091109752655, + "num_tokens": 309278152.0, + "step": 8482 + }, + { + "epoch": 1.575301764159703, + "grad_norm": 1.5548043251037598, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8740705251693726, + "num_tokens": 309314087.0, + "step": 8483 + }, + { + "epoch": 1.5754874651810584, + "grad_norm": 1.63228440284729, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8605182766914368, + "num_tokens": 309348075.0, + "step": 8484 + }, + { + "epoch": 1.5756731662024142, + "grad_norm": 1.649681568145752, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8818128108978271, + "num_tokens": 309380657.0, + "step": 8485 + }, + { + "epoch": 1.5758588672237699, + "grad_norm": 1.3983649015426636, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8850311040878296, + "num_tokens": 309419291.0, + "step": 8486 + }, + { + "epoch": 1.5760445682451254, + "grad_norm": 1.5119625329971313, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.889534056186676, + "num_tokens": 309454792.0, + "step": 8487 + }, + { + "epoch": 1.576230269266481, + "grad_norm": 1.6854335069656372, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8647153377532959, + "num_tokens": 309488875.0, + "step": 8488 + }, + { + "epoch": 1.5764159702878366, + "grad_norm": 1.594580054283142, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8741496205329895, + "num_tokens": 309520266.0, + "step": 8489 + }, + { + "epoch": 1.5766016713091922, + "grad_norm": 1.4712623357772827, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8806795477867126, + "num_tokens": 309562541.0, + "step": 8490 + }, + { + "epoch": 1.5767873723305477, + "grad_norm": 1.637498378753662, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8742182850837708, + "num_tokens": 309597537.0, + "step": 8491 + }, + { + "epoch": 1.5769730733519034, + "grad_norm": 1.68587327003479, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8875815868377686, + "num_tokens": 309626590.0, + "step": 8492 + }, + { + "epoch": 1.5771587743732591, + "grad_norm": 1.5099356174468994, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.892625093460083, + "num_tokens": 309662386.0, + "step": 8493 + }, + { + "epoch": 1.5773444753946146, + "grad_norm": 1.4668428897857666, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8759639859199524, + "num_tokens": 309703939.0, + "step": 8494 + }, + { + "epoch": 1.5775301764159702, + "grad_norm": 1.6870267391204834, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8689862489700317, + "num_tokens": 309737997.0, + "step": 8495 + }, + { + "epoch": 1.5777158774373259, + "grad_norm": 1.6291038990020752, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8832358121871948, + "num_tokens": 309772070.0, + "step": 8496 + }, + { + "epoch": 1.5779015784586816, + "grad_norm": 1.6490039825439453, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8804725408554077, + "num_tokens": 309804362.0, + "step": 8497 + }, + { + "epoch": 1.5780872794800371, + "grad_norm": 1.6001803874969482, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.886407732963562, + "num_tokens": 309837376.0, + "step": 8498 + }, + { + "epoch": 1.5782729805013926, + "grad_norm": 1.5319421291351318, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8889997005462646, + "num_tokens": 309876310.0, + "step": 8499 + }, + { + "epoch": 1.5784586815227484, + "grad_norm": 1.6220741271972656, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.879450261592865, + "num_tokens": 309910510.0, + "step": 8500 + }, + { + "epoch": 1.578644382544104, + "grad_norm": 1.5451958179473877, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8844020366668701, + "num_tokens": 309945451.0, + "step": 8501 + }, + { + "epoch": 1.5788300835654596, + "grad_norm": 1.629198431968689, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8764747381210327, + "num_tokens": 309980695.0, + "step": 8502 + }, + { + "epoch": 1.5790157845868151, + "grad_norm": 1.6061458587646484, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8739383220672607, + "num_tokens": 310016539.0, + "step": 8503 + }, + { + "epoch": 1.5792014856081709, + "grad_norm": 1.532770037651062, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8743515014648438, + "num_tokens": 310057431.0, + "step": 8504 + }, + { + "epoch": 1.5793871866295266, + "grad_norm": 1.3194960355758667, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8880820274353027, + "num_tokens": 310100921.0, + "step": 8505 + }, + { + "epoch": 1.579572887650882, + "grad_norm": 1.6738148927688599, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8657238483428955, + "num_tokens": 310136077.0, + "step": 8506 + }, + { + "epoch": 1.5797585886722376, + "grad_norm": 1.3773030042648315, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8786656856536865, + "num_tokens": 310179852.0, + "step": 8507 + }, + { + "epoch": 1.5799442896935934, + "grad_norm": 1.5516482591629028, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8767762184143066, + "num_tokens": 310215699.0, + "step": 8508 + }, + { + "epoch": 1.580129990714949, + "grad_norm": 1.7518885135650635, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8836315870285034, + "num_tokens": 310243294.0, + "step": 8509 + }, + { + "epoch": 1.5803156917363046, + "grad_norm": 1.6675347089767456, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8787847757339478, + "num_tokens": 310275662.0, + "step": 8510 + }, + { + "epoch": 1.58050139275766, + "grad_norm": 1.4875909090042114, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8800855278968811, + "num_tokens": 310313546.0, + "step": 8511 + }, + { + "epoch": 1.5806870937790158, + "grad_norm": 1.6348655223846436, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8676109910011292, + "num_tokens": 310347508.0, + "step": 8512 + }, + { + "epoch": 1.5808727948003714, + "grad_norm": 1.6427655220031738, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8703382611274719, + "num_tokens": 310382516.0, + "step": 8513 + }, + { + "epoch": 1.5810584958217269, + "grad_norm": 1.734968662261963, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8543590307235718, + "num_tokens": 310414568.0, + "step": 8514 + }, + { + "epoch": 1.5812441968430826, + "grad_norm": 1.398027777671814, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8916172385215759, + "num_tokens": 310452080.0, + "step": 8515 + }, + { + "epoch": 1.5814298978644383, + "grad_norm": 1.5200618505477905, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8491989374160767, + "num_tokens": 310492176.0, + "step": 8516 + }, + { + "epoch": 1.5816155988857938, + "grad_norm": 1.6842950582504272, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.866202175617218, + "num_tokens": 310526695.0, + "step": 8517 + }, + { + "epoch": 1.5818012999071493, + "grad_norm": 1.561794638633728, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8748764991760254, + "num_tokens": 310566096.0, + "step": 8518 + }, + { + "epoch": 1.581987000928505, + "grad_norm": 1.5809458494186401, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.868508517742157, + "num_tokens": 310602376.0, + "step": 8519 + }, + { + "epoch": 1.5821727019498608, + "grad_norm": 1.491050362586975, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.887699544429779, + "num_tokens": 310637697.0, + "step": 8520 + }, + { + "epoch": 1.5823584029712163, + "grad_norm": 1.4945414066314697, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8659777641296387, + "num_tokens": 310675213.0, + "step": 8521 + }, + { + "epoch": 1.5825441039925718, + "grad_norm": 1.525739073753357, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8732973337173462, + "num_tokens": 310712095.0, + "step": 8522 + }, + { + "epoch": 1.5827298050139276, + "grad_norm": 1.494998574256897, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8749384880065918, + "num_tokens": 310747496.0, + "step": 8523 + }, + { + "epoch": 1.5829155060352833, + "grad_norm": 1.6494005918502808, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8807346820831299, + "num_tokens": 310779086.0, + "step": 8524 + }, + { + "epoch": 1.5831012070566388, + "grad_norm": 1.5579211711883545, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8845275044441223, + "num_tokens": 310816120.0, + "step": 8525 + }, + { + "epoch": 1.5832869080779943, + "grad_norm": 1.4780057668685913, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8795552849769592, + "num_tokens": 310857636.0, + "step": 8526 + }, + { + "epoch": 1.58347260909935, + "grad_norm": 1.6018086671829224, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8732961416244507, + "num_tokens": 310895008.0, + "step": 8527 + }, + { + "epoch": 1.5836583101207058, + "grad_norm": 1.3725361824035645, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8908820152282715, + "num_tokens": 310932380.0, + "step": 8528 + }, + { + "epoch": 1.5838440111420613, + "grad_norm": 1.4207161664962769, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.8998764753341675, + "num_tokens": 310969436.0, + "step": 8529 + }, + { + "epoch": 1.5840297121634168, + "grad_norm": 1.4649569988250732, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.877223014831543, + "num_tokens": 311007854.0, + "step": 8530 + }, + { + "epoch": 1.5842154131847725, + "grad_norm": 1.8627122640609741, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8742117285728455, + "num_tokens": 311036002.0, + "step": 8531 + }, + { + "epoch": 1.5844011142061283, + "grad_norm": 1.4661582708358765, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8830515146255493, + "num_tokens": 311076298.0, + "step": 8532 + }, + { + "epoch": 1.5845868152274838, + "grad_norm": 1.6789062023162842, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8875943422317505, + "num_tokens": 311107552.0, + "step": 8533 + }, + { + "epoch": 1.5847725162488393, + "grad_norm": 1.6389001607894897, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8635789155960083, + "num_tokens": 311140913.0, + "step": 8534 + }, + { + "epoch": 1.584958217270195, + "grad_norm": 1.5320422649383545, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8654749989509583, + "num_tokens": 311180946.0, + "step": 8535 + }, + { + "epoch": 1.5851439182915508, + "grad_norm": 1.6156827211380005, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8703341484069824, + "num_tokens": 311221181.0, + "step": 8536 + }, + { + "epoch": 1.585329619312906, + "grad_norm": 1.4855916500091553, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8758825063705444, + "num_tokens": 311260725.0, + "step": 8537 + }, + { + "epoch": 1.5855153203342618, + "grad_norm": 1.6232283115386963, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8829072713851929, + "num_tokens": 311294383.0, + "step": 8538 + }, + { + "epoch": 1.5857010213556175, + "grad_norm": 1.6263115406036377, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8667221069335938, + "num_tokens": 311328148.0, + "step": 8539 + }, + { + "epoch": 1.585886722376973, + "grad_norm": 1.5181803703308105, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8798761367797852, + "num_tokens": 311362231.0, + "step": 8540 + }, + { + "epoch": 1.5860724233983285, + "grad_norm": 1.4416710138320923, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8726989030838013, + "num_tokens": 311402636.0, + "step": 8541 + }, + { + "epoch": 1.5862581244196843, + "grad_norm": 1.471230387687683, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8846033811569214, + "num_tokens": 311441512.0, + "step": 8542 + }, + { + "epoch": 1.58644382544104, + "grad_norm": 1.5550471544265747, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8805861473083496, + "num_tokens": 311476426.0, + "step": 8543 + }, + { + "epoch": 1.5866295264623955, + "grad_norm": 1.7164323329925537, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8794098496437073, + "num_tokens": 311506477.0, + "step": 8544 + }, + { + "epoch": 1.586815227483751, + "grad_norm": 1.445643424987793, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.8963326215744019, + "num_tokens": 311541302.0, + "step": 8545 + }, + { + "epoch": 1.5870009285051068, + "grad_norm": 1.537250280380249, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8828033208847046, + "num_tokens": 311577757.0, + "step": 8546 + }, + { + "epoch": 1.5871866295264625, + "grad_norm": 1.8737893104553223, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8634783029556274, + "num_tokens": 311611609.0, + "step": 8547 + }, + { + "epoch": 1.587372330547818, + "grad_norm": 1.6704410314559937, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8716484308242798, + "num_tokens": 311644904.0, + "step": 8548 + }, + { + "epoch": 1.5875580315691735, + "grad_norm": 1.606015920639038, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8752714395523071, + "num_tokens": 311679254.0, + "step": 8549 + }, + { + "epoch": 1.5877437325905293, + "grad_norm": 1.69967782497406, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8706749677658081, + "num_tokens": 311710269.0, + "step": 8550 + }, + { + "epoch": 1.587929433611885, + "grad_norm": 1.5721005201339722, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.887715220451355, + "num_tokens": 311741694.0, + "step": 8551 + }, + { + "epoch": 1.5881151346332405, + "grad_norm": 1.6004821062088013, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8597981333732605, + "num_tokens": 311779568.0, + "step": 8552 + }, + { + "epoch": 1.588300835654596, + "grad_norm": 1.635926604270935, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8746349811553955, + "num_tokens": 311812489.0, + "step": 8553 + }, + { + "epoch": 1.5884865366759517, + "grad_norm": 1.6894360780715942, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8727105855941772, + "num_tokens": 311847500.0, + "step": 8554 + }, + { + "epoch": 1.5886722376973075, + "grad_norm": 1.5963622331619263, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8763949275016785, + "num_tokens": 311882632.0, + "step": 8555 + }, + { + "epoch": 1.588857938718663, + "grad_norm": 1.5918618440628052, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8754059672355652, + "num_tokens": 311915732.0, + "step": 8556 + }, + { + "epoch": 1.5890436397400185, + "grad_norm": 1.5736945867538452, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8728281259536743, + "num_tokens": 311951266.0, + "step": 8557 + }, + { + "epoch": 1.5892293407613742, + "grad_norm": 1.4834175109863281, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8901482224464417, + "num_tokens": 311987543.0, + "step": 8558 + }, + { + "epoch": 1.58941504178273, + "grad_norm": 1.6371458768844604, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8752496838569641, + "num_tokens": 312022765.0, + "step": 8559 + }, + { + "epoch": 1.5896007428040855, + "grad_norm": 1.6352922916412354, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8809103965759277, + "num_tokens": 312058945.0, + "step": 8560 + }, + { + "epoch": 1.589786443825441, + "grad_norm": 1.4576075077056885, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8772108554840088, + "num_tokens": 312095610.0, + "step": 8561 + }, + { + "epoch": 1.5899721448467967, + "grad_norm": 1.471064567565918, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8803860545158386, + "num_tokens": 312134453.0, + "step": 8562 + }, + { + "epoch": 1.5901578458681522, + "grad_norm": 1.416787028312683, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.888719916343689, + "num_tokens": 312175416.0, + "step": 8563 + }, + { + "epoch": 1.5903435468895077, + "grad_norm": 1.4356638193130493, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8913853764533997, + "num_tokens": 312215061.0, + "step": 8564 + }, + { + "epoch": 1.5905292479108635, + "grad_norm": 1.5425695180892944, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8817354440689087, + "num_tokens": 312248941.0, + "step": 8565 + }, + { + "epoch": 1.5907149489322192, + "grad_norm": 1.5001394748687744, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8913020491600037, + "num_tokens": 312285157.0, + "step": 8566 + }, + { + "epoch": 1.5909006499535747, + "grad_norm": 1.6277083158493042, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8817145824432373, + "num_tokens": 312318019.0, + "step": 8567 + }, + { + "epoch": 1.5910863509749302, + "grad_norm": 1.5942480564117432, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8919006586074829, + "num_tokens": 312349218.0, + "step": 8568 + }, + { + "epoch": 1.591272051996286, + "grad_norm": 1.4648250341415405, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8864148259162903, + "num_tokens": 312386386.0, + "step": 8569 + }, + { + "epoch": 1.5914577530176417, + "grad_norm": 1.7374873161315918, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8695023059844971, + "num_tokens": 312418166.0, + "step": 8570 + }, + { + "epoch": 1.5916434540389972, + "grad_norm": 1.5632028579711914, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8641485571861267, + "num_tokens": 312455799.0, + "step": 8571 + }, + { + "epoch": 1.5918291550603527, + "grad_norm": 1.602151870727539, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8813828229904175, + "num_tokens": 312492480.0, + "step": 8572 + }, + { + "epoch": 1.5920148560817085, + "grad_norm": 1.6287564039230347, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8766511082649231, + "num_tokens": 312531739.0, + "step": 8573 + }, + { + "epoch": 1.5922005571030642, + "grad_norm": 1.6710641384124756, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8840445280075073, + "num_tokens": 312565297.0, + "step": 8574 + }, + { + "epoch": 1.5923862581244197, + "grad_norm": 1.4446475505828857, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8900666236877441, + "num_tokens": 312606257.0, + "step": 8575 + }, + { + "epoch": 1.5925719591457752, + "grad_norm": 1.5952026844024658, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8820459842681885, + "num_tokens": 312643266.0, + "step": 8576 + }, + { + "epoch": 1.592757660167131, + "grad_norm": 1.5165003538131714, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8715271353721619, + "num_tokens": 312682885.0, + "step": 8577 + }, + { + "epoch": 1.5929433611884867, + "grad_norm": 1.4219545125961304, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8907275199890137, + "num_tokens": 312718109.0, + "step": 8578 + }, + { + "epoch": 1.5931290622098422, + "grad_norm": 1.6771876811981201, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8826435804367065, + "num_tokens": 312747945.0, + "step": 8579 + }, + { + "epoch": 1.5933147632311977, + "grad_norm": 1.4955788850784302, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.892261266708374, + "num_tokens": 312782241.0, + "step": 8580 + }, + { + "epoch": 1.5935004642525534, + "grad_norm": 1.344377040863037, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8786756992340088, + "num_tokens": 312827886.0, + "step": 8581 + }, + { + "epoch": 1.5936861652739092, + "grad_norm": 1.548564076423645, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8694626688957214, + "num_tokens": 312867155.0, + "step": 8582 + }, + { + "epoch": 1.5938718662952647, + "grad_norm": 1.580352783203125, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8656291961669922, + "num_tokens": 312902823.0, + "step": 8583 + }, + { + "epoch": 1.5940575673166202, + "grad_norm": 1.4085487127304077, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.882636308670044, + "num_tokens": 312941521.0, + "step": 8584 + }, + { + "epoch": 1.594243268337976, + "grad_norm": 1.6046174764633179, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8559479713439941, + "num_tokens": 312980556.0, + "step": 8585 + }, + { + "epoch": 1.5944289693593314, + "grad_norm": 1.7298393249511719, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.880153477191925, + "num_tokens": 313012275.0, + "step": 8586 + }, + { + "epoch": 1.594614670380687, + "grad_norm": 1.6070473194122314, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8746548891067505, + "num_tokens": 313051082.0, + "step": 8587 + }, + { + "epoch": 1.5948003714020427, + "grad_norm": 1.5183181762695312, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8657726049423218, + "num_tokens": 313093495.0, + "step": 8588 + }, + { + "epoch": 1.5949860724233984, + "grad_norm": 1.880291223526001, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.871108889579773, + "num_tokens": 313122875.0, + "step": 8589 + }, + { + "epoch": 1.595171773444754, + "grad_norm": 1.6050158739089966, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8826468586921692, + "num_tokens": 313160533.0, + "step": 8590 + }, + { + "epoch": 1.5953574744661094, + "grad_norm": 1.7089654207229614, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8694524168968201, + "num_tokens": 313194577.0, + "step": 8591 + }, + { + "epoch": 1.5955431754874652, + "grad_norm": 1.59602952003479, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8661413192749023, + "num_tokens": 313230981.0, + "step": 8592 + }, + { + "epoch": 1.595728876508821, + "grad_norm": 1.5655041933059692, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8663537502288818, + "num_tokens": 313266362.0, + "step": 8593 + }, + { + "epoch": 1.5959145775301764, + "grad_norm": 1.5578757524490356, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8713489770889282, + "num_tokens": 313304820.0, + "step": 8594 + }, + { + "epoch": 1.596100278551532, + "grad_norm": 1.490980625152588, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8853901624679565, + "num_tokens": 313342006.0, + "step": 8595 + }, + { + "epoch": 1.5962859795728876, + "grad_norm": 1.607094407081604, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.881172239780426, + "num_tokens": 313375176.0, + "step": 8596 + }, + { + "epoch": 1.5964716805942434, + "grad_norm": 1.4891071319580078, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8847033381462097, + "num_tokens": 313410905.0, + "step": 8597 + }, + { + "epoch": 1.596657381615599, + "grad_norm": 1.571203351020813, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8882734775543213, + "num_tokens": 313443557.0, + "step": 8598 + }, + { + "epoch": 1.5968430826369544, + "grad_norm": 1.5666438341140747, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8909124135971069, + "num_tokens": 313476794.0, + "step": 8599 + }, + { + "epoch": 1.5970287836583101, + "grad_norm": 1.4260599613189697, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8848787546157837, + "num_tokens": 313516817.0, + "step": 8600 + }, + { + "epoch": 1.5972144846796659, + "grad_norm": 1.558879017829895, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8830499649047852, + "num_tokens": 313550731.0, + "step": 8601 + }, + { + "epoch": 1.5974001857010214, + "grad_norm": 1.5985685586929321, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8659429550170898, + "num_tokens": 313584812.0, + "step": 8602 + }, + { + "epoch": 1.597585886722377, + "grad_norm": 1.758220911026001, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8743586540222168, + "num_tokens": 313616806.0, + "step": 8603 + }, + { + "epoch": 1.5977715877437326, + "grad_norm": 1.7270647287368774, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8817794919013977, + "num_tokens": 313648118.0, + "step": 8604 + }, + { + "epoch": 1.5979572887650884, + "grad_norm": 1.641870141029358, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8743083477020264, + "num_tokens": 313683041.0, + "step": 8605 + }, + { + "epoch": 1.5981429897864439, + "grad_norm": 1.5488238334655762, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8718518018722534, + "num_tokens": 313721808.0, + "step": 8606 + }, + { + "epoch": 1.5983286908077994, + "grad_norm": 1.4546657800674438, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8879279494285583, + "num_tokens": 313760497.0, + "step": 8607 + }, + { + "epoch": 1.5985143918291551, + "grad_norm": 1.444397211074829, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8791335225105286, + "num_tokens": 313799733.0, + "step": 8608 + }, + { + "epoch": 1.5987000928505106, + "grad_norm": 1.4679195880889893, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.889189600944519, + "num_tokens": 313834575.0, + "step": 8609 + }, + { + "epoch": 1.5988857938718661, + "grad_norm": 1.5345782041549683, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8697654008865356, + "num_tokens": 313874112.0, + "step": 8610 + }, + { + "epoch": 1.5990714948932219, + "grad_norm": 1.5145472288131714, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8811880350112915, + "num_tokens": 313913524.0, + "step": 8611 + }, + { + "epoch": 1.5992571959145776, + "grad_norm": 1.4760410785675049, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8714612722396851, + "num_tokens": 313953462.0, + "step": 8612 + }, + { + "epoch": 1.5994428969359331, + "grad_norm": 1.5138969421386719, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8768641948699951, + "num_tokens": 313989801.0, + "step": 8613 + }, + { + "epoch": 1.5996285979572886, + "grad_norm": 1.6103698015213013, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8772857189178467, + "num_tokens": 314023823.0, + "step": 8614 + }, + { + "epoch": 1.5998142989786444, + "grad_norm": 1.3964873552322388, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8787020444869995, + "num_tokens": 314064207.0, + "step": 8615 + }, + { + "epoch": 1.6, + "grad_norm": 1.3008848428726196, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8897814154624939, + "num_tokens": 314108514.0, + "step": 8616 + }, + { + "epoch": 1.6001857010213556, + "grad_norm": 1.4854810237884521, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.888819694519043, + "num_tokens": 314143383.0, + "step": 8617 + }, + { + "epoch": 1.600371402042711, + "grad_norm": 1.6235227584838867, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8938380479812622, + "num_tokens": 314173496.0, + "step": 8618 + }, + { + "epoch": 1.6005571030640668, + "grad_norm": 1.6631834506988525, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8819894790649414, + "num_tokens": 314204207.0, + "step": 8619 + }, + { + "epoch": 1.6007428040854226, + "grad_norm": 1.4369968175888062, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8869303464889526, + "num_tokens": 314243178.0, + "step": 8620 + }, + { + "epoch": 1.600928505106778, + "grad_norm": 1.524178147315979, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8812962174415588, + "num_tokens": 314278381.0, + "step": 8621 + }, + { + "epoch": 1.6011142061281336, + "grad_norm": 1.4655555486679077, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.885413408279419, + "num_tokens": 314318441.0, + "step": 8622 + }, + { + "epoch": 1.6012999071494893, + "grad_norm": 1.6267931461334229, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8812360763549805, + "num_tokens": 314351884.0, + "step": 8623 + }, + { + "epoch": 1.601485608170845, + "grad_norm": 1.4454511404037476, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8939586281776428, + "num_tokens": 314391588.0, + "step": 8624 + }, + { + "epoch": 1.6016713091922006, + "grad_norm": 1.6363136768341064, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.8952425718307495, + "num_tokens": 314420721.0, + "step": 8625 + }, + { + "epoch": 1.601857010213556, + "grad_norm": 1.5679301023483276, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8840919733047485, + "num_tokens": 314455258.0, + "step": 8626 + }, + { + "epoch": 1.6020427112349118, + "grad_norm": 1.5227936506271362, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8767933249473572, + "num_tokens": 314492826.0, + "step": 8627 + }, + { + "epoch": 1.6022284122562676, + "grad_norm": 1.4348540306091309, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.889193058013916, + "num_tokens": 314532434.0, + "step": 8628 + }, + { + "epoch": 1.602414113277623, + "grad_norm": 1.5235981941223145, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8869192600250244, + "num_tokens": 314568351.0, + "step": 8629 + }, + { + "epoch": 1.6025998142989786, + "grad_norm": 1.6209490299224854, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8708317875862122, + "num_tokens": 314606801.0, + "step": 8630 + }, + { + "epoch": 1.6027855153203343, + "grad_norm": 1.6009999513626099, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.883579432964325, + "num_tokens": 314646095.0, + "step": 8631 + }, + { + "epoch": 1.60297121634169, + "grad_norm": 1.5783514976501465, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8671313524246216, + "num_tokens": 314681124.0, + "step": 8632 + }, + { + "epoch": 1.6031569173630453, + "grad_norm": 1.6143248081207275, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8841360807418823, + "num_tokens": 314713174.0, + "step": 8633 + }, + { + "epoch": 1.603342618384401, + "grad_norm": 1.4180837869644165, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8840759992599487, + "num_tokens": 314752828.0, + "step": 8634 + }, + { + "epoch": 1.6035283194057568, + "grad_norm": 1.6295546293258667, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8807569146156311, + "num_tokens": 314785169.0, + "step": 8635 + }, + { + "epoch": 1.6037140204271123, + "grad_norm": 1.4781713485717773, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.869246244430542, + "num_tokens": 314822672.0, + "step": 8636 + }, + { + "epoch": 1.6038997214484678, + "grad_norm": 1.5849875211715698, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8711819052696228, + "num_tokens": 314859521.0, + "step": 8637 + }, + { + "epoch": 1.6040854224698236, + "grad_norm": 1.488202452659607, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8879106044769287, + "num_tokens": 314892151.0, + "step": 8638 + }, + { + "epoch": 1.6042711234911793, + "grad_norm": 1.362396001815796, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8894299268722534, + "num_tokens": 314935051.0, + "step": 8639 + }, + { + "epoch": 1.6044568245125348, + "grad_norm": 1.4278010129928589, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.882144570350647, + "num_tokens": 314974776.0, + "step": 8640 + }, + { + "epoch": 1.6046425255338903, + "grad_norm": 1.4570245742797852, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8717093467712402, + "num_tokens": 315015090.0, + "step": 8641 + }, + { + "epoch": 1.604828226555246, + "grad_norm": 1.649526834487915, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8576356768608093, + "num_tokens": 315053519.0, + "step": 8642 + }, + { + "epoch": 1.6050139275766018, + "grad_norm": 1.4492344856262207, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.888556957244873, + "num_tokens": 315088922.0, + "step": 8643 + }, + { + "epoch": 1.6051996285979573, + "grad_norm": 1.517085075378418, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.878328263759613, + "num_tokens": 315127124.0, + "step": 8644 + }, + { + "epoch": 1.6053853296193128, + "grad_norm": 1.5105026960372925, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8721153736114502, + "num_tokens": 315169598.0, + "step": 8645 + }, + { + "epoch": 1.6055710306406685, + "grad_norm": 1.7161755561828613, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8667551875114441, + "num_tokens": 315202040.0, + "step": 8646 + }, + { + "epoch": 1.6057567316620243, + "grad_norm": 1.528952956199646, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.888923168182373, + "num_tokens": 315236012.0, + "step": 8647 + }, + { + "epoch": 1.6059424326833798, + "grad_norm": 1.6197178363800049, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8809453845024109, + "num_tokens": 315269948.0, + "step": 8648 + }, + { + "epoch": 1.6061281337047353, + "grad_norm": 1.6400355100631714, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8768031597137451, + "num_tokens": 315299577.0, + "step": 8649 + }, + { + "epoch": 1.606313834726091, + "grad_norm": 1.516236662864685, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8682461977005005, + "num_tokens": 315337728.0, + "step": 8650 + }, + { + "epoch": 1.6064995357474467, + "grad_norm": 1.4279035329818726, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.9004902243614197, + "num_tokens": 315374508.0, + "step": 8651 + }, + { + "epoch": 1.6066852367688023, + "grad_norm": 1.6346509456634521, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8750568628311157, + "num_tokens": 315408573.0, + "step": 8652 + }, + { + "epoch": 1.6068709377901578, + "grad_norm": 1.4389704465866089, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.882131814956665, + "num_tokens": 315448389.0, + "step": 8653 + }, + { + "epoch": 1.6070566388115135, + "grad_norm": 1.621524453163147, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8685766458511353, + "num_tokens": 315481572.0, + "step": 8654 + }, + { + "epoch": 1.6072423398328692, + "grad_norm": 1.411681056022644, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8699790239334106, + "num_tokens": 315525247.0, + "step": 8655 + }, + { + "epoch": 1.6074280408542247, + "grad_norm": 1.4871538877487183, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.879426121711731, + "num_tokens": 315565749.0, + "step": 8656 + }, + { + "epoch": 1.6076137418755803, + "grad_norm": 1.6477246284484863, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8821368217468262, + "num_tokens": 315599184.0, + "step": 8657 + }, + { + "epoch": 1.607799442896936, + "grad_norm": 1.7047607898712158, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8673825263977051, + "num_tokens": 315629761.0, + "step": 8658 + }, + { + "epoch": 1.6079851439182915, + "grad_norm": 1.4163509607315063, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8845110535621643, + "num_tokens": 315665780.0, + "step": 8659 + }, + { + "epoch": 1.608170844939647, + "grad_norm": 1.6468064785003662, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8776615262031555, + "num_tokens": 315694476.0, + "step": 8660 + }, + { + "epoch": 1.6083565459610027, + "grad_norm": 1.5299361944198608, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8722988367080688, + "num_tokens": 315732992.0, + "step": 8661 + }, + { + "epoch": 1.6085422469823585, + "grad_norm": 1.5888270139694214, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8709896802902222, + "num_tokens": 315767329.0, + "step": 8662 + }, + { + "epoch": 1.608727948003714, + "grad_norm": 1.4207992553710938, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8842655420303345, + "num_tokens": 315807781.0, + "step": 8663 + }, + { + "epoch": 1.6089136490250695, + "grad_norm": 1.5645986795425415, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8923379182815552, + "num_tokens": 315844027.0, + "step": 8664 + }, + { + "epoch": 1.6090993500464252, + "grad_norm": 1.5084660053253174, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8797851800918579, + "num_tokens": 315880397.0, + "step": 8665 + }, + { + "epoch": 1.609285051067781, + "grad_norm": 1.5939799547195435, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8904802799224854, + "num_tokens": 315914676.0, + "step": 8666 + }, + { + "epoch": 1.6094707520891365, + "grad_norm": 1.456908106803894, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8955427408218384, + "num_tokens": 315950984.0, + "step": 8667 + }, + { + "epoch": 1.609656453110492, + "grad_norm": 1.5447332859039307, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8883394002914429, + "num_tokens": 315982838.0, + "step": 8668 + }, + { + "epoch": 1.6098421541318477, + "grad_norm": 1.706933617591858, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8830313682556152, + "num_tokens": 316014357.0, + "step": 8669 + }, + { + "epoch": 1.6100278551532035, + "grad_norm": 1.7003432512283325, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8853278756141663, + "num_tokens": 316051369.0, + "step": 8670 + }, + { + "epoch": 1.610213556174559, + "grad_norm": 1.5249305963516235, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8634710311889648, + "num_tokens": 316093188.0, + "step": 8671 + }, + { + "epoch": 1.6103992571959145, + "grad_norm": 1.4413360357284546, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8902513980865479, + "num_tokens": 316129290.0, + "step": 8672 + }, + { + "epoch": 1.6105849582172702, + "grad_norm": 1.6326243877410889, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8720083832740784, + "num_tokens": 316164530.0, + "step": 8673 + }, + { + "epoch": 1.610770659238626, + "grad_norm": 1.491738200187683, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8863707184791565, + "num_tokens": 316201733.0, + "step": 8674 + }, + { + "epoch": 1.6109563602599815, + "grad_norm": 1.4788814783096313, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8921757936477661, + "num_tokens": 316237596.0, + "step": 8675 + }, + { + "epoch": 1.611142061281337, + "grad_norm": 1.6101725101470947, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8714787364006042, + "num_tokens": 316272943.0, + "step": 8676 + }, + { + "epoch": 1.6113277623026927, + "grad_norm": 1.4905412197113037, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8678446412086487, + "num_tokens": 316314178.0, + "step": 8677 + }, + { + "epoch": 1.6115134633240484, + "grad_norm": 1.6640534400939941, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8918904662132263, + "num_tokens": 316344590.0, + "step": 8678 + }, + { + "epoch": 1.611699164345404, + "grad_norm": 1.504111647605896, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8831262588500977, + "num_tokens": 316381015.0, + "step": 8679 + }, + { + "epoch": 1.6118848653667595, + "grad_norm": 1.6507155895233154, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8794717192649841, + "num_tokens": 316415263.0, + "step": 8680 + }, + { + "epoch": 1.6120705663881152, + "grad_norm": 1.671572208404541, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8807470798492432, + "num_tokens": 316449269.0, + "step": 8681 + }, + { + "epoch": 1.6122562674094707, + "grad_norm": 1.5300782918930054, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8795385360717773, + "num_tokens": 316485074.0, + "step": 8682 + }, + { + "epoch": 1.6124419684308262, + "grad_norm": 1.567663550376892, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8661368489265442, + "num_tokens": 316524598.0, + "step": 8683 + }, + { + "epoch": 1.612627669452182, + "grad_norm": 1.5248364210128784, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8792890310287476, + "num_tokens": 316563964.0, + "step": 8684 + }, + { + "epoch": 1.6128133704735377, + "grad_norm": 1.5303531885147095, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8789074420928955, + "num_tokens": 316601156.0, + "step": 8685 + }, + { + "epoch": 1.6129990714948932, + "grad_norm": 1.613939881324768, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8635634183883667, + "num_tokens": 316642746.0, + "step": 8686 + }, + { + "epoch": 1.6131847725162487, + "grad_norm": 1.6841415166854858, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8851883411407471, + "num_tokens": 316674676.0, + "step": 8687 + }, + { + "epoch": 1.6133704735376044, + "grad_norm": 1.769912600517273, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8760245442390442, + "num_tokens": 316705711.0, + "step": 8688 + }, + { + "epoch": 1.6135561745589602, + "grad_norm": 1.6357195377349854, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8753236532211304, + "num_tokens": 316739467.0, + "step": 8689 + }, + { + "epoch": 1.6137418755803157, + "grad_norm": 1.5508127212524414, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8712378144264221, + "num_tokens": 316776049.0, + "step": 8690 + }, + { + "epoch": 1.6139275766016712, + "grad_norm": 1.4346152544021606, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8612666726112366, + "num_tokens": 316819944.0, + "step": 8691 + }, + { + "epoch": 1.614113277623027, + "grad_norm": 1.7158654928207397, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8552314639091492, + "num_tokens": 316852403.0, + "step": 8692 + }, + { + "epoch": 1.6142989786443827, + "grad_norm": 1.6339893341064453, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8758134245872498, + "num_tokens": 316887217.0, + "step": 8693 + }, + { + "epoch": 1.6144846796657382, + "grad_norm": 1.5762825012207031, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8725310564041138, + "num_tokens": 316925040.0, + "step": 8694 + }, + { + "epoch": 1.6146703806870937, + "grad_norm": 1.6268573999404907, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.886182427406311, + "num_tokens": 316955570.0, + "step": 8695 + }, + { + "epoch": 1.6148560817084494, + "grad_norm": 1.44132661819458, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8650305271148682, + "num_tokens": 316999480.0, + "step": 8696 + }, + { + "epoch": 1.6150417827298051, + "grad_norm": 1.4317059516906738, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8770668506622314, + "num_tokens": 317040589.0, + "step": 8697 + }, + { + "epoch": 1.6152274837511607, + "grad_norm": 1.5878410339355469, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8815597295761108, + "num_tokens": 317072018.0, + "step": 8698 + }, + { + "epoch": 1.6154131847725162, + "grad_norm": 1.4991881847381592, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8805500268936157, + "num_tokens": 317109561.0, + "step": 8699 + }, + { + "epoch": 1.615598885793872, + "grad_norm": 1.583884596824646, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8781101107597351, + "num_tokens": 317142101.0, + "step": 8700 + }, + { + "epoch": 1.6157845868152276, + "grad_norm": 1.5992355346679688, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8738073110580444, + "num_tokens": 317177329.0, + "step": 8701 + }, + { + "epoch": 1.6159702878365831, + "grad_norm": 1.466942548751831, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8693169951438904, + "num_tokens": 317217214.0, + "step": 8702 + }, + { + "epoch": 1.6161559888579387, + "grad_norm": 1.5959810018539429, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8861032128334045, + "num_tokens": 317250511.0, + "step": 8703 + }, + { + "epoch": 1.6163416898792944, + "grad_norm": 1.3701974153518677, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8931955695152283, + "num_tokens": 317290882.0, + "step": 8704 + }, + { + "epoch": 1.6165273909006501, + "grad_norm": 1.50121009349823, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8747920989990234, + "num_tokens": 317330531.0, + "step": 8705 + }, + { + "epoch": 1.6167130919220054, + "grad_norm": 1.4741445779800415, + "learning_rate": 1e-06, + "loss": 0.2806, + "mean_token_accuracy": 0.8985863327980042, + "num_tokens": 317364148.0, + "step": 8706 + }, + { + "epoch": 1.6168987929433611, + "grad_norm": 1.3423147201538086, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8867464661598206, + "num_tokens": 317405336.0, + "step": 8707 + }, + { + "epoch": 1.6170844939647169, + "grad_norm": 1.5239801406860352, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8762309551239014, + "num_tokens": 317439985.0, + "step": 8708 + }, + { + "epoch": 1.6172701949860724, + "grad_norm": 1.5902663469314575, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8691925406455994, + "num_tokens": 317476007.0, + "step": 8709 + }, + { + "epoch": 1.617455896007428, + "grad_norm": 1.4919759035110474, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8715667128562927, + "num_tokens": 317514405.0, + "step": 8710 + }, + { + "epoch": 1.6176415970287836, + "grad_norm": 1.6617329120635986, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8707459568977356, + "num_tokens": 317544060.0, + "step": 8711 + }, + { + "epoch": 1.6178272980501394, + "grad_norm": 1.6226900815963745, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8805533647537231, + "num_tokens": 317575766.0, + "step": 8712 + }, + { + "epoch": 1.6180129990714949, + "grad_norm": 1.4663803577423096, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8706234097480774, + "num_tokens": 317615968.0, + "step": 8713 + }, + { + "epoch": 1.6181987000928504, + "grad_norm": 1.414362907409668, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8954457640647888, + "num_tokens": 317651660.0, + "step": 8714 + }, + { + "epoch": 1.6183844011142061, + "grad_norm": 1.6095229387283325, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8842514753341675, + "num_tokens": 317682224.0, + "step": 8715 + }, + { + "epoch": 1.6185701021355619, + "grad_norm": 1.5349656343460083, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8866665363311768, + "num_tokens": 317719664.0, + "step": 8716 + }, + { + "epoch": 1.6187558031569174, + "grad_norm": 1.4405560493469238, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8891232013702393, + "num_tokens": 317756861.0, + "step": 8717 + }, + { + "epoch": 1.6189415041782729, + "grad_norm": 1.525351881980896, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8651319742202759, + "num_tokens": 317794971.0, + "step": 8718 + }, + { + "epoch": 1.6191272051996286, + "grad_norm": 1.6607441902160645, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8779535293579102, + "num_tokens": 317825600.0, + "step": 8719 + }, + { + "epoch": 1.6193129062209843, + "grad_norm": 1.6115238666534424, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8782368898391724, + "num_tokens": 317859236.0, + "step": 8720 + }, + { + "epoch": 1.6194986072423398, + "grad_norm": 1.44072425365448, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8758606910705566, + "num_tokens": 317896871.0, + "step": 8721 + }, + { + "epoch": 1.6196843082636954, + "grad_norm": 1.5312429666519165, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8731398582458496, + "num_tokens": 317938683.0, + "step": 8722 + }, + { + "epoch": 1.619870009285051, + "grad_norm": 1.4828065633773804, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8805844783782959, + "num_tokens": 317977444.0, + "step": 8723 + }, + { + "epoch": 1.6200557103064068, + "grad_norm": 1.5294616222381592, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8796471357345581, + "num_tokens": 318016117.0, + "step": 8724 + }, + { + "epoch": 1.6202414113277623, + "grad_norm": 1.5851389169692993, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8558440208435059, + "num_tokens": 318053901.0, + "step": 8725 + }, + { + "epoch": 1.6204271123491178, + "grad_norm": 1.4291788339614868, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.888151228427887, + "num_tokens": 318093725.0, + "step": 8726 + }, + { + "epoch": 1.6206128133704736, + "grad_norm": 1.5115777254104614, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.870236873626709, + "num_tokens": 318130446.0, + "step": 8727 + }, + { + "epoch": 1.6207985143918293, + "grad_norm": 1.4164233207702637, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8874238133430481, + "num_tokens": 318171781.0, + "step": 8728 + }, + { + "epoch": 1.6209842154131848, + "grad_norm": 1.5622915029525757, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8591045141220093, + "num_tokens": 318211985.0, + "step": 8729 + }, + { + "epoch": 1.6211699164345403, + "grad_norm": 1.3800091743469238, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.87649005651474, + "num_tokens": 318253474.0, + "step": 8730 + }, + { + "epoch": 1.621355617455896, + "grad_norm": 1.5997544527053833, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8903251886367798, + "num_tokens": 318288108.0, + "step": 8731 + }, + { + "epoch": 1.6215413184772516, + "grad_norm": 1.7034927606582642, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8738207221031189, + "num_tokens": 318317538.0, + "step": 8732 + }, + { + "epoch": 1.621727019498607, + "grad_norm": 1.4895504713058472, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8890485167503357, + "num_tokens": 318355499.0, + "step": 8733 + }, + { + "epoch": 1.6219127205199628, + "grad_norm": 1.6122061014175415, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8702815175056458, + "num_tokens": 318391021.0, + "step": 8734 + }, + { + "epoch": 1.6220984215413186, + "grad_norm": 1.5270357131958008, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8704178929328918, + "num_tokens": 318430145.0, + "step": 8735 + }, + { + "epoch": 1.622284122562674, + "grad_norm": 1.4648725986480713, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8875531554222107, + "num_tokens": 318466695.0, + "step": 8736 + }, + { + "epoch": 1.6224698235840296, + "grad_norm": 1.6799428462982178, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8690478801727295, + "num_tokens": 318499977.0, + "step": 8737 + }, + { + "epoch": 1.6226555246053853, + "grad_norm": 1.6180588006973267, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8640579581260681, + "num_tokens": 318536801.0, + "step": 8738 + }, + { + "epoch": 1.622841225626741, + "grad_norm": 1.5832833051681519, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8774752020835876, + "num_tokens": 318573038.0, + "step": 8739 + }, + { + "epoch": 1.6230269266480966, + "grad_norm": 1.53805410861969, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8725461363792419, + "num_tokens": 318609620.0, + "step": 8740 + }, + { + "epoch": 1.623212627669452, + "grad_norm": 1.7007356882095337, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8624316453933716, + "num_tokens": 318641580.0, + "step": 8741 + }, + { + "epoch": 1.6233983286908078, + "grad_norm": 1.4787663221359253, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8704541325569153, + "num_tokens": 318682091.0, + "step": 8742 + }, + { + "epoch": 1.6235840297121635, + "grad_norm": 1.473313570022583, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8777599334716797, + "num_tokens": 318717423.0, + "step": 8743 + }, + { + "epoch": 1.623769730733519, + "grad_norm": 1.4487465620040894, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8802518844604492, + "num_tokens": 318755968.0, + "step": 8744 + }, + { + "epoch": 1.6239554317548746, + "grad_norm": 1.623579978942871, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8892480134963989, + "num_tokens": 318786009.0, + "step": 8745 + }, + { + "epoch": 1.6241411327762303, + "grad_norm": 1.6890869140625, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8616218566894531, + "num_tokens": 318818807.0, + "step": 8746 + }, + { + "epoch": 1.624326833797586, + "grad_norm": 1.7120859622955322, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8657952547073364, + "num_tokens": 318852483.0, + "step": 8747 + }, + { + "epoch": 1.6245125348189415, + "grad_norm": 1.4897888898849487, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8652940988540649, + "num_tokens": 318892316.0, + "step": 8748 + }, + { + "epoch": 1.624698235840297, + "grad_norm": 1.6449226140975952, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8776146173477173, + "num_tokens": 318926349.0, + "step": 8749 + }, + { + "epoch": 1.6248839368616528, + "grad_norm": 1.6209397315979004, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8675270080566406, + "num_tokens": 318962757.0, + "step": 8750 + }, + { + "epoch": 1.6250696378830085, + "grad_norm": 1.5126773118972778, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8701522946357727, + "num_tokens": 319001777.0, + "step": 8751 + }, + { + "epoch": 1.625255338904364, + "grad_norm": 1.501922607421875, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8908886909484863, + "num_tokens": 319036947.0, + "step": 8752 + }, + { + "epoch": 1.6254410399257195, + "grad_norm": 1.57886803150177, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8692702651023865, + "num_tokens": 319073488.0, + "step": 8753 + }, + { + "epoch": 1.6256267409470753, + "grad_norm": 1.4664521217346191, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8821265697479248, + "num_tokens": 319113546.0, + "step": 8754 + }, + { + "epoch": 1.6258124419684308, + "grad_norm": 1.6743528842926025, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8670077323913574, + "num_tokens": 319145719.0, + "step": 8755 + }, + { + "epoch": 1.6259981429897863, + "grad_norm": 1.5739351511001587, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8838990330696106, + "num_tokens": 319181277.0, + "step": 8756 + }, + { + "epoch": 1.626183844011142, + "grad_norm": 1.4558452367782593, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8654414415359497, + "num_tokens": 319223792.0, + "step": 8757 + }, + { + "epoch": 1.6263695450324978, + "grad_norm": 1.4066319465637207, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.881196916103363, + "num_tokens": 319264412.0, + "step": 8758 + }, + { + "epoch": 1.6265552460538533, + "grad_norm": 1.4983834028244019, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8865729570388794, + "num_tokens": 319304994.0, + "step": 8759 + }, + { + "epoch": 1.6267409470752088, + "grad_norm": 1.5518298149108887, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8878566026687622, + "num_tokens": 319339058.0, + "step": 8760 + }, + { + "epoch": 1.6269266480965645, + "grad_norm": 1.5283818244934082, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8738284111022949, + "num_tokens": 319377169.0, + "step": 8761 + }, + { + "epoch": 1.6271123491179202, + "grad_norm": 1.4271601438522339, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8861448764801025, + "num_tokens": 319414767.0, + "step": 8762 + }, + { + "epoch": 1.6272980501392758, + "grad_norm": 1.7691398859024048, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8809777498245239, + "num_tokens": 319446726.0, + "step": 8763 + }, + { + "epoch": 1.6274837511606313, + "grad_norm": 1.6459860801696777, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8564935922622681, + "num_tokens": 319485830.0, + "step": 8764 + }, + { + "epoch": 1.627669452181987, + "grad_norm": 1.590343713760376, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.88286954164505, + "num_tokens": 319521200.0, + "step": 8765 + }, + { + "epoch": 1.6278551532033427, + "grad_norm": 1.6003129482269287, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8573602437973022, + "num_tokens": 319558375.0, + "step": 8766 + }, + { + "epoch": 1.6280408542246982, + "grad_norm": 1.698742151260376, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.877171516418457, + "num_tokens": 319592395.0, + "step": 8767 + }, + { + "epoch": 1.6282265552460538, + "grad_norm": 1.6700924634933472, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8728749752044678, + "num_tokens": 319629694.0, + "step": 8768 + }, + { + "epoch": 1.6284122562674095, + "grad_norm": 1.6184272766113281, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8920176029205322, + "num_tokens": 319664482.0, + "step": 8769 + }, + { + "epoch": 1.6285979572887652, + "grad_norm": 1.677744746208191, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8832478523254395, + "num_tokens": 319693136.0, + "step": 8770 + }, + { + "epoch": 1.6287836583101207, + "grad_norm": 1.6037509441375732, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8861207365989685, + "num_tokens": 319728739.0, + "step": 8771 + }, + { + "epoch": 1.6289693593314762, + "grad_norm": 1.6348825693130493, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8739506602287292, + "num_tokens": 319763126.0, + "step": 8772 + }, + { + "epoch": 1.629155060352832, + "grad_norm": 1.6292113065719604, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8843508958816528, + "num_tokens": 319798883.0, + "step": 8773 + }, + { + "epoch": 1.6293407613741877, + "grad_norm": 1.6175323724746704, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.866380512714386, + "num_tokens": 319837187.0, + "step": 8774 + }, + { + "epoch": 1.6295264623955432, + "grad_norm": 1.71251380443573, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8646052479743958, + "num_tokens": 319870897.0, + "step": 8775 + }, + { + "epoch": 1.6297121634168987, + "grad_norm": 1.4843682050704956, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8763421773910522, + "num_tokens": 319908181.0, + "step": 8776 + }, + { + "epoch": 1.6298978644382545, + "grad_norm": 1.6272543668746948, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8748195171356201, + "num_tokens": 319942518.0, + "step": 8777 + }, + { + "epoch": 1.63008356545961, + "grad_norm": 1.6307134628295898, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8897174596786499, + "num_tokens": 319974387.0, + "step": 8778 + }, + { + "epoch": 1.6302692664809655, + "grad_norm": 1.3928160667419434, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8858586549758911, + "num_tokens": 320013720.0, + "step": 8779 + }, + { + "epoch": 1.6304549675023212, + "grad_norm": 1.5564535856246948, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8785247802734375, + "num_tokens": 320047675.0, + "step": 8780 + }, + { + "epoch": 1.630640668523677, + "grad_norm": 1.4176234006881714, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8699723482131958, + "num_tokens": 320090973.0, + "step": 8781 + }, + { + "epoch": 1.6308263695450325, + "grad_norm": 1.389715313911438, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8865277171134949, + "num_tokens": 320132913.0, + "step": 8782 + }, + { + "epoch": 1.631012070566388, + "grad_norm": 1.4958723783493042, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8828136324882507, + "num_tokens": 320169252.0, + "step": 8783 + }, + { + "epoch": 1.6311977715877437, + "grad_norm": 1.4579576253890991, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8801718354225159, + "num_tokens": 320209080.0, + "step": 8784 + }, + { + "epoch": 1.6313834726090994, + "grad_norm": 1.4784233570098877, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.874192476272583, + "num_tokens": 320248877.0, + "step": 8785 + }, + { + "epoch": 1.631569173630455, + "grad_norm": 1.5233383178710938, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8800315856933594, + "num_tokens": 320285350.0, + "step": 8786 + }, + { + "epoch": 1.6317548746518105, + "grad_norm": 1.5290335416793823, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8687893152236938, + "num_tokens": 320325450.0, + "step": 8787 + }, + { + "epoch": 1.6319405756731662, + "grad_norm": 1.4492435455322266, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8889297246932983, + "num_tokens": 320364350.0, + "step": 8788 + }, + { + "epoch": 1.632126276694522, + "grad_norm": 1.4066359996795654, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8702893257141113, + "num_tokens": 320406974.0, + "step": 8789 + }, + { + "epoch": 1.6323119777158774, + "grad_norm": 1.6490287780761719, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8861677646636963, + "num_tokens": 320438880.0, + "step": 8790 + }, + { + "epoch": 1.632497678737233, + "grad_norm": 1.5550190210342407, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8561236262321472, + "num_tokens": 320474529.0, + "step": 8791 + }, + { + "epoch": 1.6326833797585887, + "grad_norm": 1.441542148590088, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8777680993080139, + "num_tokens": 320515174.0, + "step": 8792 + }, + { + "epoch": 1.6328690807799444, + "grad_norm": 1.4693269729614258, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.883689284324646, + "num_tokens": 320552445.0, + "step": 8793 + }, + { + "epoch": 1.6330547818013, + "grad_norm": 1.6349375247955322, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8742272853851318, + "num_tokens": 320587095.0, + "step": 8794 + }, + { + "epoch": 1.6332404828226554, + "grad_norm": 1.5218286514282227, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8740159869194031, + "num_tokens": 320625579.0, + "step": 8795 + }, + { + "epoch": 1.6334261838440112, + "grad_norm": 1.495357632637024, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8854619860649109, + "num_tokens": 320661605.0, + "step": 8796 + }, + { + "epoch": 1.633611884865367, + "grad_norm": 1.5430315732955933, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.88230961561203, + "num_tokens": 320699401.0, + "step": 8797 + }, + { + "epoch": 1.6337975858867224, + "grad_norm": 1.5135293006896973, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8849868774414062, + "num_tokens": 320731876.0, + "step": 8798 + }, + { + "epoch": 1.633983286908078, + "grad_norm": 1.4956159591674805, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8839846849441528, + "num_tokens": 320768003.0, + "step": 8799 + }, + { + "epoch": 1.6341689879294337, + "grad_norm": 1.4515811204910278, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8748565912246704, + "num_tokens": 320807422.0, + "step": 8800 + }, + { + "epoch": 1.6343546889507894, + "grad_norm": 1.6722935438156128, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8711381554603577, + "num_tokens": 320841213.0, + "step": 8801 + }, + { + "epoch": 1.6345403899721447, + "grad_norm": 1.4493910074234009, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.889029860496521, + "num_tokens": 320879749.0, + "step": 8802 + }, + { + "epoch": 1.6347260909935004, + "grad_norm": 1.6355093717575073, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8870391249656677, + "num_tokens": 320910857.0, + "step": 8803 + }, + { + "epoch": 1.6349117920148561, + "grad_norm": 1.3795164823532104, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.889340877532959, + "num_tokens": 320949615.0, + "step": 8804 + }, + { + "epoch": 1.6350974930362117, + "grad_norm": 1.6231187582015991, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8778288960456848, + "num_tokens": 320984513.0, + "step": 8805 + }, + { + "epoch": 1.6352831940575672, + "grad_norm": 1.5818076133728027, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8687615394592285, + "num_tokens": 321020006.0, + "step": 8806 + }, + { + "epoch": 1.635468895078923, + "grad_norm": 1.3898504972457886, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8874814510345459, + "num_tokens": 321059471.0, + "step": 8807 + }, + { + "epoch": 1.6356545961002786, + "grad_norm": 1.5969606637954712, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8754962682723999, + "num_tokens": 321096976.0, + "step": 8808 + }, + { + "epoch": 1.6358402971216341, + "grad_norm": 1.4824215173721313, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8805683851242065, + "num_tokens": 321136999.0, + "step": 8809 + }, + { + "epoch": 1.6360259981429897, + "grad_norm": 1.7597603797912598, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8654292821884155, + "num_tokens": 321170239.0, + "step": 8810 + }, + { + "epoch": 1.6362116991643454, + "grad_norm": 1.5196613073349, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8877993226051331, + "num_tokens": 321207573.0, + "step": 8811 + }, + { + "epoch": 1.6363974001857011, + "grad_norm": 1.4811710119247437, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8760973215103149, + "num_tokens": 321247300.0, + "step": 8812 + }, + { + "epoch": 1.6365831012070566, + "grad_norm": 1.596477746963501, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8756920695304871, + "num_tokens": 321283989.0, + "step": 8813 + }, + { + "epoch": 1.6367688022284121, + "grad_norm": 1.5642954111099243, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8644160032272339, + "num_tokens": 321325261.0, + "step": 8814 + }, + { + "epoch": 1.6369545032497679, + "grad_norm": 1.4886845350265503, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8717280626296997, + "num_tokens": 321362917.0, + "step": 8815 + }, + { + "epoch": 1.6371402042711236, + "grad_norm": 1.4410109519958496, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8878832459449768, + "num_tokens": 321401151.0, + "step": 8816 + }, + { + "epoch": 1.6373259052924791, + "grad_norm": 1.4266051054000854, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8824296593666077, + "num_tokens": 321440522.0, + "step": 8817 + }, + { + "epoch": 1.6375116063138346, + "grad_norm": 1.5611292123794556, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8790674209594727, + "num_tokens": 321474824.0, + "step": 8818 + }, + { + "epoch": 1.6376973073351904, + "grad_norm": 1.5780996084213257, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8792606592178345, + "num_tokens": 321509615.0, + "step": 8819 + }, + { + "epoch": 1.637883008356546, + "grad_norm": 1.5107088088989258, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8880572319030762, + "num_tokens": 321541707.0, + "step": 8820 + }, + { + "epoch": 1.6380687093779016, + "grad_norm": 1.4843661785125732, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8740463852882385, + "num_tokens": 321583358.0, + "step": 8821 + }, + { + "epoch": 1.6382544103992571, + "grad_norm": 1.7453079223632812, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8604683876037598, + "num_tokens": 321614187.0, + "step": 8822 + }, + { + "epoch": 1.6384401114206129, + "grad_norm": 1.4447646141052246, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8796777725219727, + "num_tokens": 321650700.0, + "step": 8823 + }, + { + "epoch": 1.6386258124419686, + "grad_norm": 1.535300374031067, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8724794983863831, + "num_tokens": 321686236.0, + "step": 8824 + }, + { + "epoch": 1.638811513463324, + "grad_norm": 1.6163949966430664, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8786325454711914, + "num_tokens": 321722468.0, + "step": 8825 + }, + { + "epoch": 1.6389972144846796, + "grad_norm": 1.6457256078720093, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8940700888633728, + "num_tokens": 321753009.0, + "step": 8826 + }, + { + "epoch": 1.6391829155060353, + "grad_norm": 1.5417617559432983, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8764547109603882, + "num_tokens": 321791088.0, + "step": 8827 + }, + { + "epoch": 1.6393686165273909, + "grad_norm": 1.6352863311767578, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8967390656471252, + "num_tokens": 321820448.0, + "step": 8828 + }, + { + "epoch": 1.6395543175487464, + "grad_norm": 1.408661961555481, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8882448673248291, + "num_tokens": 321858180.0, + "step": 8829 + }, + { + "epoch": 1.639740018570102, + "grad_norm": 1.6469913721084595, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8732595443725586, + "num_tokens": 321891380.0, + "step": 8830 + }, + { + "epoch": 1.6399257195914578, + "grad_norm": 1.6222649812698364, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8759205937385559, + "num_tokens": 321927395.0, + "step": 8831 + }, + { + "epoch": 1.6401114206128133, + "grad_norm": 1.6064428091049194, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8663899898529053, + "num_tokens": 321963880.0, + "step": 8832 + }, + { + "epoch": 1.6402971216341689, + "grad_norm": 1.483449101448059, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8759699463844299, + "num_tokens": 322004739.0, + "step": 8833 + }, + { + "epoch": 1.6404828226555246, + "grad_norm": 1.5612592697143555, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8689665198326111, + "num_tokens": 322042071.0, + "step": 8834 + }, + { + "epoch": 1.6406685236768803, + "grad_norm": 1.6705971956253052, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8696841597557068, + "num_tokens": 322072769.0, + "step": 8835 + }, + { + "epoch": 1.6408542246982358, + "grad_norm": 1.5766916275024414, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8797050714492798, + "num_tokens": 322109113.0, + "step": 8836 + }, + { + "epoch": 1.6410399257195913, + "grad_norm": 1.4836372137069702, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8814484477043152, + "num_tokens": 322148256.0, + "step": 8837 + }, + { + "epoch": 1.641225626740947, + "grad_norm": 1.4788461923599243, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8822202086448669, + "num_tokens": 322184396.0, + "step": 8838 + }, + { + "epoch": 1.6414113277623028, + "grad_norm": 1.7087593078613281, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8753595352172852, + "num_tokens": 322218347.0, + "step": 8839 + }, + { + "epoch": 1.6415970287836583, + "grad_norm": 1.4506351947784424, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8816664218902588, + "num_tokens": 322256086.0, + "step": 8840 + }, + { + "epoch": 1.6417827298050138, + "grad_norm": 1.4750359058380127, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.891571044921875, + "num_tokens": 322292416.0, + "step": 8841 + }, + { + "epoch": 1.6419684308263696, + "grad_norm": 1.5327122211456299, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.869140625, + "num_tokens": 322332098.0, + "step": 8842 + }, + { + "epoch": 1.6421541318477253, + "grad_norm": 1.4665473699569702, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.862993597984314, + "num_tokens": 322377925.0, + "step": 8843 + }, + { + "epoch": 1.6423398328690808, + "grad_norm": 1.7525871992111206, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8622297644615173, + "num_tokens": 322408925.0, + "step": 8844 + }, + { + "epoch": 1.6425255338904363, + "grad_norm": 1.5870906114578247, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8708381652832031, + "num_tokens": 322443389.0, + "step": 8845 + }, + { + "epoch": 1.642711234911792, + "grad_norm": 1.6420025825500488, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8759734630584717, + "num_tokens": 322476384.0, + "step": 8846 + }, + { + "epoch": 1.6428969359331478, + "grad_norm": 1.5071630477905273, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8882972002029419, + "num_tokens": 322510391.0, + "step": 8847 + }, + { + "epoch": 1.6430826369545033, + "grad_norm": 1.5022022724151611, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8782674074172974, + "num_tokens": 322548709.0, + "step": 8848 + }, + { + "epoch": 1.6432683379758588, + "grad_norm": 1.489499568939209, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8893998861312866, + "num_tokens": 322583603.0, + "step": 8849 + }, + { + "epoch": 1.6434540389972145, + "grad_norm": 1.3955893516540527, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8926503658294678, + "num_tokens": 322620610.0, + "step": 8850 + }, + { + "epoch": 1.64363974001857, + "grad_norm": 1.5247303247451782, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8607631921768188, + "num_tokens": 322662990.0, + "step": 8851 + }, + { + "epoch": 1.6438254410399256, + "grad_norm": 1.5692756175994873, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8913163542747498, + "num_tokens": 322695521.0, + "step": 8852 + }, + { + "epoch": 1.6440111420612813, + "grad_norm": 1.594135046005249, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8818259239196777, + "num_tokens": 322728234.0, + "step": 8853 + }, + { + "epoch": 1.644196843082637, + "grad_norm": 1.7080814838409424, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8707690238952637, + "num_tokens": 322760818.0, + "step": 8854 + }, + { + "epoch": 1.6443825441039925, + "grad_norm": 1.7371829748153687, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8738650679588318, + "num_tokens": 322796083.0, + "step": 8855 + }, + { + "epoch": 1.644568245125348, + "grad_norm": 1.5862610340118408, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8793075084686279, + "num_tokens": 322834220.0, + "step": 8856 + }, + { + "epoch": 1.6447539461467038, + "grad_norm": 1.544766902923584, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8813703656196594, + "num_tokens": 322872889.0, + "step": 8857 + }, + { + "epoch": 1.6449396471680595, + "grad_norm": 1.5744130611419678, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8899447321891785, + "num_tokens": 322907449.0, + "step": 8858 + }, + { + "epoch": 1.645125348189415, + "grad_norm": 1.5804457664489746, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8588638305664062, + "num_tokens": 322948089.0, + "step": 8859 + }, + { + "epoch": 1.6453110492107705, + "grad_norm": 1.538340449333191, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8827500343322754, + "num_tokens": 322984181.0, + "step": 8860 + }, + { + "epoch": 1.6454967502321263, + "grad_norm": 1.441554307937622, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8880152702331543, + "num_tokens": 323027713.0, + "step": 8861 + }, + { + "epoch": 1.645682451253482, + "grad_norm": 1.70075261592865, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8847227096557617, + "num_tokens": 323055267.0, + "step": 8862 + }, + { + "epoch": 1.6458681522748375, + "grad_norm": 1.443634271621704, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8831648230552673, + "num_tokens": 323094805.0, + "step": 8863 + }, + { + "epoch": 1.646053853296193, + "grad_norm": 1.460326910018921, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8865829706192017, + "num_tokens": 323133216.0, + "step": 8864 + }, + { + "epoch": 1.6462395543175488, + "grad_norm": 1.4523887634277344, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8827218413352966, + "num_tokens": 323176536.0, + "step": 8865 + }, + { + "epoch": 1.6464252553389045, + "grad_norm": 1.669875144958496, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8740665912628174, + "num_tokens": 323208899.0, + "step": 8866 + }, + { + "epoch": 1.64661095636026, + "grad_norm": 1.403230905532837, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8875798583030701, + "num_tokens": 323250559.0, + "step": 8867 + }, + { + "epoch": 1.6467966573816155, + "grad_norm": 1.597835659980774, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.870826005935669, + "num_tokens": 323287778.0, + "step": 8868 + }, + { + "epoch": 1.6469823584029712, + "grad_norm": 1.5787115097045898, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8836202025413513, + "num_tokens": 323322056.0, + "step": 8869 + }, + { + "epoch": 1.647168059424327, + "grad_norm": 1.5825942754745483, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8790888786315918, + "num_tokens": 323363617.0, + "step": 8870 + }, + { + "epoch": 1.6473537604456825, + "grad_norm": 1.6093406677246094, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8812435865402222, + "num_tokens": 323398036.0, + "step": 8871 + }, + { + "epoch": 1.647539461467038, + "grad_norm": 1.5532326698303223, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8875270485877991, + "num_tokens": 323432494.0, + "step": 8872 + }, + { + "epoch": 1.6477251624883937, + "grad_norm": 1.5192301273345947, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.890436053276062, + "num_tokens": 323464764.0, + "step": 8873 + }, + { + "epoch": 1.6479108635097495, + "grad_norm": 1.5427970886230469, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8730463981628418, + "num_tokens": 323501102.0, + "step": 8874 + }, + { + "epoch": 1.6480965645311048, + "grad_norm": 1.5135055780410767, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8884584307670593, + "num_tokens": 323537591.0, + "step": 8875 + }, + { + "epoch": 1.6482822655524605, + "grad_norm": 1.5260978937149048, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8823907375335693, + "num_tokens": 323574409.0, + "step": 8876 + }, + { + "epoch": 1.6484679665738162, + "grad_norm": 1.531761646270752, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8732901215553284, + "num_tokens": 323612189.0, + "step": 8877 + }, + { + "epoch": 1.6486536675951717, + "grad_norm": 1.4855687618255615, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8779168128967285, + "num_tokens": 323652484.0, + "step": 8878 + }, + { + "epoch": 1.6488393686165272, + "grad_norm": 1.535774827003479, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8792184591293335, + "num_tokens": 323688425.0, + "step": 8879 + }, + { + "epoch": 1.649025069637883, + "grad_norm": 1.61819326877594, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8867458701133728, + "num_tokens": 323722893.0, + "step": 8880 + }, + { + "epoch": 1.6492107706592387, + "grad_norm": 1.5504931211471558, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8837474584579468, + "num_tokens": 323757040.0, + "step": 8881 + }, + { + "epoch": 1.6493964716805942, + "grad_norm": 1.6346683502197266, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8809837102890015, + "num_tokens": 323789216.0, + "step": 8882 + }, + { + "epoch": 1.6495821727019497, + "grad_norm": 1.5892958641052246, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8691843748092651, + "num_tokens": 323828926.0, + "step": 8883 + }, + { + "epoch": 1.6497678737233055, + "grad_norm": 1.4537745714187622, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8618990778923035, + "num_tokens": 323868350.0, + "step": 8884 + }, + { + "epoch": 1.6499535747446612, + "grad_norm": 1.5836751461029053, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8786563873291016, + "num_tokens": 323905027.0, + "step": 8885 + }, + { + "epoch": 1.6501392757660167, + "grad_norm": 1.5078935623168945, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8800444602966309, + "num_tokens": 323940130.0, + "step": 8886 + }, + { + "epoch": 1.6503249767873722, + "grad_norm": 1.6522417068481445, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8574856519699097, + "num_tokens": 323975788.0, + "step": 8887 + }, + { + "epoch": 1.650510677808728, + "grad_norm": 1.6532799005508423, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8807864785194397, + "num_tokens": 324006144.0, + "step": 8888 + }, + { + "epoch": 1.6506963788300837, + "grad_norm": 1.534764289855957, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8872997760772705, + "num_tokens": 324043769.0, + "step": 8889 + }, + { + "epoch": 1.6508820798514392, + "grad_norm": 1.6125298738479614, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.886421799659729, + "num_tokens": 324080980.0, + "step": 8890 + }, + { + "epoch": 1.6510677808727947, + "grad_norm": 1.547118067741394, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8744058012962341, + "num_tokens": 324117805.0, + "step": 8891 + }, + { + "epoch": 1.6512534818941504, + "grad_norm": 1.3237953186035156, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8865932822227478, + "num_tokens": 324162487.0, + "step": 8892 + }, + { + "epoch": 1.6514391829155062, + "grad_norm": 1.4523550271987915, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8820098638534546, + "num_tokens": 324199249.0, + "step": 8893 + }, + { + "epoch": 1.6516248839368617, + "grad_norm": 1.629281759262085, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8652867078781128, + "num_tokens": 324234628.0, + "step": 8894 + }, + { + "epoch": 1.6518105849582172, + "grad_norm": 1.408470869064331, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.872130274772644, + "num_tokens": 324278068.0, + "step": 8895 + }, + { + "epoch": 1.651996285979573, + "grad_norm": 1.5768351554870605, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8759847283363342, + "num_tokens": 324315447.0, + "step": 8896 + }, + { + "epoch": 1.6521819870009287, + "grad_norm": 1.5361645221710205, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8706953525543213, + "num_tokens": 324354521.0, + "step": 8897 + }, + { + "epoch": 1.6523676880222842, + "grad_norm": 1.4542555809020996, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8846937417984009, + "num_tokens": 324390404.0, + "step": 8898 + }, + { + "epoch": 1.6525533890436397, + "grad_norm": 1.5326662063598633, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8876135349273682, + "num_tokens": 324423052.0, + "step": 8899 + }, + { + "epoch": 1.6527390900649954, + "grad_norm": 1.5484975576400757, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8732974529266357, + "num_tokens": 324459345.0, + "step": 8900 + }, + { + "epoch": 1.652924791086351, + "grad_norm": 1.523819923400879, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8919447064399719, + "num_tokens": 324493622.0, + "step": 8901 + }, + { + "epoch": 1.6531104921077064, + "grad_norm": 1.6439181566238403, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8873024582862854, + "num_tokens": 324521789.0, + "step": 8902 + }, + { + "epoch": 1.6532961931290622, + "grad_norm": 1.4511358737945557, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8737989664077759, + "num_tokens": 324564250.0, + "step": 8903 + }, + { + "epoch": 1.653481894150418, + "grad_norm": 1.7539923191070557, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8871567249298096, + "num_tokens": 324592339.0, + "step": 8904 + }, + { + "epoch": 1.6536675951717734, + "grad_norm": 1.5411065816879272, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8894987106323242, + "num_tokens": 324626884.0, + "step": 8905 + }, + { + "epoch": 1.653853296193129, + "grad_norm": 1.5403363704681396, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8755251169204712, + "num_tokens": 324664319.0, + "step": 8906 + }, + { + "epoch": 1.6540389972144847, + "grad_norm": 1.5380984544754028, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8785078525543213, + "num_tokens": 324700553.0, + "step": 8907 + }, + { + "epoch": 1.6542246982358404, + "grad_norm": 1.4310368299484253, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8896781802177429, + "num_tokens": 324740365.0, + "step": 8908 + }, + { + "epoch": 1.654410399257196, + "grad_norm": 1.433087706565857, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8834699988365173, + "num_tokens": 324776905.0, + "step": 8909 + }, + { + "epoch": 1.6545961002785514, + "grad_norm": 1.6566118001937866, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8587830066680908, + "num_tokens": 324809058.0, + "step": 8910 + }, + { + "epoch": 1.6547818012999072, + "grad_norm": 1.7002058029174805, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8693828582763672, + "num_tokens": 324845374.0, + "step": 8911 + }, + { + "epoch": 1.6549675023212629, + "grad_norm": 1.4609543085098267, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8640896081924438, + "num_tokens": 324887125.0, + "step": 8912 + }, + { + "epoch": 1.6551532033426184, + "grad_norm": 1.6875056028366089, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8592671155929565, + "num_tokens": 324923345.0, + "step": 8913 + }, + { + "epoch": 1.655338904363974, + "grad_norm": 1.4488555192947388, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8833168148994446, + "num_tokens": 324963292.0, + "step": 8914 + }, + { + "epoch": 1.6555246053853296, + "grad_norm": 1.4837970733642578, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8770084381103516, + "num_tokens": 325005147.0, + "step": 8915 + }, + { + "epoch": 1.6557103064066854, + "grad_norm": 1.5710564851760864, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8701132535934448, + "num_tokens": 325041159.0, + "step": 8916 + }, + { + "epoch": 1.6558960074280409, + "grad_norm": 1.4100686311721802, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8895869255065918, + "num_tokens": 325081509.0, + "step": 8917 + }, + { + "epoch": 1.6560817084493964, + "grad_norm": 1.5549907684326172, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.875686764717102, + "num_tokens": 325129321.0, + "step": 8918 + }, + { + "epoch": 1.6562674094707521, + "grad_norm": 1.6439383029937744, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8682554960250854, + "num_tokens": 325164961.0, + "step": 8919 + }, + { + "epoch": 1.6564531104921079, + "grad_norm": 1.619030475616455, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8832656145095825, + "num_tokens": 325196024.0, + "step": 8920 + }, + { + "epoch": 1.6566388115134634, + "grad_norm": 1.5264863967895508, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8842048645019531, + "num_tokens": 325229210.0, + "step": 8921 + }, + { + "epoch": 1.6568245125348189, + "grad_norm": 1.5528507232666016, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8645397424697876, + "num_tokens": 325265744.0, + "step": 8922 + }, + { + "epoch": 1.6570102135561746, + "grad_norm": 1.7673816680908203, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8770261406898499, + "num_tokens": 325296826.0, + "step": 8923 + }, + { + "epoch": 1.6571959145775301, + "grad_norm": 1.56832754611969, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8680279850959778, + "num_tokens": 325332713.0, + "step": 8924 + }, + { + "epoch": 1.6573816155988856, + "grad_norm": 1.541955828666687, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8876692056655884, + "num_tokens": 325369794.0, + "step": 8925 + }, + { + "epoch": 1.6575673166202414, + "grad_norm": 1.5758098363876343, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8734503984451294, + "num_tokens": 325405931.0, + "step": 8926 + }, + { + "epoch": 1.657753017641597, + "grad_norm": 1.5768636465072632, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8829952478408813, + "num_tokens": 325444477.0, + "step": 8927 + }, + { + "epoch": 1.6579387186629526, + "grad_norm": 1.513156771659851, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8755621910095215, + "num_tokens": 325481712.0, + "step": 8928 + }, + { + "epoch": 1.6581244196843081, + "grad_norm": 1.490841269493103, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8797736167907715, + "num_tokens": 325521133.0, + "step": 8929 + }, + { + "epoch": 1.6583101207056639, + "grad_norm": 1.5989502668380737, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8822850584983826, + "num_tokens": 325555498.0, + "step": 8930 + }, + { + "epoch": 1.6584958217270196, + "grad_norm": 1.5278223752975464, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8678532838821411, + "num_tokens": 325598592.0, + "step": 8931 + }, + { + "epoch": 1.658681522748375, + "grad_norm": 1.498925805091858, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.886579155921936, + "num_tokens": 325638349.0, + "step": 8932 + }, + { + "epoch": 1.6588672237697306, + "grad_norm": 1.4803962707519531, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8692365884780884, + "num_tokens": 325679127.0, + "step": 8933 + }, + { + "epoch": 1.6590529247910863, + "grad_norm": 1.439606785774231, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8893274068832397, + "num_tokens": 325714300.0, + "step": 8934 + }, + { + "epoch": 1.659238625812442, + "grad_norm": 1.4742038249969482, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8882384300231934, + "num_tokens": 325751173.0, + "step": 8935 + }, + { + "epoch": 1.6594243268337976, + "grad_norm": 1.5464144945144653, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8881992101669312, + "num_tokens": 325782737.0, + "step": 8936 + }, + { + "epoch": 1.659610027855153, + "grad_norm": 1.4059885740280151, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8823315501213074, + "num_tokens": 325823418.0, + "step": 8937 + }, + { + "epoch": 1.6597957288765088, + "grad_norm": 1.4649746417999268, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8721835613250732, + "num_tokens": 325865512.0, + "step": 8938 + }, + { + "epoch": 1.6599814298978646, + "grad_norm": 1.5750951766967773, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8789215683937073, + "num_tokens": 325899113.0, + "step": 8939 + }, + { + "epoch": 1.66016713091922, + "grad_norm": 1.5274426937103271, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8763697147369385, + "num_tokens": 325938050.0, + "step": 8940 + }, + { + "epoch": 1.6603528319405756, + "grad_norm": 1.43131422996521, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8793492913246155, + "num_tokens": 325977908.0, + "step": 8941 + }, + { + "epoch": 1.6605385329619313, + "grad_norm": 1.5404820442199707, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8634049892425537, + "num_tokens": 326020359.0, + "step": 8942 + }, + { + "epoch": 1.660724233983287, + "grad_norm": 1.6153942346572876, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8854464888572693, + "num_tokens": 326053032.0, + "step": 8943 + }, + { + "epoch": 1.6609099350046426, + "grad_norm": 1.5612667798995972, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8812791109085083, + "num_tokens": 326089093.0, + "step": 8944 + }, + { + "epoch": 1.661095636025998, + "grad_norm": 1.532824158668518, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8822696208953857, + "num_tokens": 326124499.0, + "step": 8945 + }, + { + "epoch": 1.6612813370473538, + "grad_norm": 1.5232104063034058, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8808890581130981, + "num_tokens": 326158801.0, + "step": 8946 + }, + { + "epoch": 1.6614670380687093, + "grad_norm": 1.7263360023498535, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8767496347427368, + "num_tokens": 326189529.0, + "step": 8947 + }, + { + "epoch": 1.6616527390900648, + "grad_norm": 1.61371648311615, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8775267004966736, + "num_tokens": 326225555.0, + "step": 8948 + }, + { + "epoch": 1.6618384401114206, + "grad_norm": 1.608005404472351, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8680450916290283, + "num_tokens": 326261859.0, + "step": 8949 + }, + { + "epoch": 1.6620241411327763, + "grad_norm": 1.7988673448562622, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8812026381492615, + "num_tokens": 326292981.0, + "step": 8950 + }, + { + "epoch": 1.6622098421541318, + "grad_norm": 1.6324255466461182, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8811109066009521, + "num_tokens": 326322372.0, + "step": 8951 + }, + { + "epoch": 1.6623955431754873, + "grad_norm": 1.4684315919876099, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8855776786804199, + "num_tokens": 326360299.0, + "step": 8952 + }, + { + "epoch": 1.662581244196843, + "grad_norm": 1.5246654748916626, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8791505098342896, + "num_tokens": 326394095.0, + "step": 8953 + }, + { + "epoch": 1.6627669452181988, + "grad_norm": 1.645269751548767, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8753410577774048, + "num_tokens": 326429804.0, + "step": 8954 + }, + { + "epoch": 1.6629526462395543, + "grad_norm": 1.7277954816818237, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8617423176765442, + "num_tokens": 326461280.0, + "step": 8955 + }, + { + "epoch": 1.6631383472609098, + "grad_norm": 1.5175001621246338, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8721144199371338, + "num_tokens": 326501948.0, + "step": 8956 + }, + { + "epoch": 1.6633240482822655, + "grad_norm": 1.461668848991394, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8903535008430481, + "num_tokens": 326538247.0, + "step": 8957 + }, + { + "epoch": 1.6635097493036213, + "grad_norm": 1.6401901245117188, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8820855021476746, + "num_tokens": 326569942.0, + "step": 8958 + }, + { + "epoch": 1.6636954503249768, + "grad_norm": 1.436434268951416, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8849560022354126, + "num_tokens": 326608463.0, + "step": 8959 + }, + { + "epoch": 1.6638811513463323, + "grad_norm": 1.6491167545318604, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8661118149757385, + "num_tokens": 326641961.0, + "step": 8960 + }, + { + "epoch": 1.664066852367688, + "grad_norm": 1.5247753858566284, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8751799464225769, + "num_tokens": 326682024.0, + "step": 8961 + }, + { + "epoch": 1.6642525533890438, + "grad_norm": 1.777406096458435, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8674348592758179, + "num_tokens": 326714429.0, + "step": 8962 + }, + { + "epoch": 1.6644382544103993, + "grad_norm": 1.4172776937484741, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8942494988441467, + "num_tokens": 326750334.0, + "step": 8963 + }, + { + "epoch": 1.6646239554317548, + "grad_norm": 1.4506160020828247, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.877173662185669, + "num_tokens": 326789830.0, + "step": 8964 + }, + { + "epoch": 1.6648096564531105, + "grad_norm": 1.5424659252166748, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8668338060379028, + "num_tokens": 326834405.0, + "step": 8965 + }, + { + "epoch": 1.6649953574744663, + "grad_norm": 1.5424503087997437, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8855777382850647, + "num_tokens": 326876271.0, + "step": 8966 + }, + { + "epoch": 1.6651810584958218, + "grad_norm": 1.3847483396530151, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.890207052230835, + "num_tokens": 326919156.0, + "step": 8967 + }, + { + "epoch": 1.6653667595171773, + "grad_norm": 1.474779725074768, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8682544231414795, + "num_tokens": 326960950.0, + "step": 8968 + }, + { + "epoch": 1.665552460538533, + "grad_norm": 1.6059458255767822, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8868796825408936, + "num_tokens": 326995327.0, + "step": 8969 + }, + { + "epoch": 1.6657381615598887, + "grad_norm": 1.4557119607925415, + "learning_rate": 1e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.8964073657989502, + "num_tokens": 327030711.0, + "step": 8970 + }, + { + "epoch": 1.665923862581244, + "grad_norm": 1.66823410987854, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.881394624710083, + "num_tokens": 327064595.0, + "step": 8971 + }, + { + "epoch": 1.6661095636025998, + "grad_norm": 1.5192228555679321, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8913575410842896, + "num_tokens": 327102080.0, + "step": 8972 + }, + { + "epoch": 1.6662952646239555, + "grad_norm": 1.5619405508041382, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8734363913536072, + "num_tokens": 327135572.0, + "step": 8973 + }, + { + "epoch": 1.666480965645311, + "grad_norm": 1.4877698421478271, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8773354291915894, + "num_tokens": 327175339.0, + "step": 8974 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.589493751525879, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8811014294624329, + "num_tokens": 327210986.0, + "step": 8975 + }, + { + "epoch": 1.6668523676880223, + "grad_norm": 1.6132547855377197, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8553593754768372, + "num_tokens": 327249612.0, + "step": 8976 + }, + { + "epoch": 1.667038068709378, + "grad_norm": 1.590108036994934, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8645055294036865, + "num_tokens": 327286718.0, + "step": 8977 + }, + { + "epoch": 1.6672237697307335, + "grad_norm": 1.4925135374069214, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8816156387329102, + "num_tokens": 327321843.0, + "step": 8978 + }, + { + "epoch": 1.667409470752089, + "grad_norm": 1.5028090476989746, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8864352703094482, + "num_tokens": 327360002.0, + "step": 8979 + }, + { + "epoch": 1.6675951717734447, + "grad_norm": 1.6150352954864502, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.872909426689148, + "num_tokens": 327393535.0, + "step": 8980 + }, + { + "epoch": 1.6677808727948005, + "grad_norm": 1.46867835521698, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8802759647369385, + "num_tokens": 327429485.0, + "step": 8981 + }, + { + "epoch": 1.667966573816156, + "grad_norm": 1.7789103984832764, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8782973289489746, + "num_tokens": 327458343.0, + "step": 8982 + }, + { + "epoch": 1.6681522748375115, + "grad_norm": 1.46290123462677, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8768441677093506, + "num_tokens": 327500347.0, + "step": 8983 + }, + { + "epoch": 1.6683379758588672, + "grad_norm": 1.5042519569396973, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8703628778457642, + "num_tokens": 327539746.0, + "step": 8984 + }, + { + "epoch": 1.668523676880223, + "grad_norm": 1.4589675664901733, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8910335302352905, + "num_tokens": 327578295.0, + "step": 8985 + }, + { + "epoch": 1.6687093779015785, + "grad_norm": 1.4622286558151245, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8780584335327148, + "num_tokens": 327617615.0, + "step": 8986 + }, + { + "epoch": 1.668895078922934, + "grad_norm": 1.6093491315841675, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8688762187957764, + "num_tokens": 327650383.0, + "step": 8987 + }, + { + "epoch": 1.6690807799442897, + "grad_norm": 1.561499834060669, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8801938891410828, + "num_tokens": 327686134.0, + "step": 8988 + }, + { + "epoch": 1.6692664809656454, + "grad_norm": 1.4132121801376343, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8918545842170715, + "num_tokens": 327722470.0, + "step": 8989 + }, + { + "epoch": 1.669452181987001, + "grad_norm": 1.574539303779602, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8809384107589722, + "num_tokens": 327759781.0, + "step": 8990 + }, + { + "epoch": 1.6696378830083565, + "grad_norm": 1.419562578201294, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8847895860671997, + "num_tokens": 327799116.0, + "step": 8991 + }, + { + "epoch": 1.6698235840297122, + "grad_norm": 1.5537192821502686, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8785475492477417, + "num_tokens": 327836798.0, + "step": 8992 + }, + { + "epoch": 1.670009285051068, + "grad_norm": 1.4532060623168945, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8872663974761963, + "num_tokens": 327875539.0, + "step": 8993 + }, + { + "epoch": 1.6701949860724234, + "grad_norm": 1.55532705783844, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8673220872879028, + "num_tokens": 327916094.0, + "step": 8994 + }, + { + "epoch": 1.670380687093779, + "grad_norm": 1.4822909832000732, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8730302453041077, + "num_tokens": 327957024.0, + "step": 8995 + }, + { + "epoch": 1.6705663881151347, + "grad_norm": 1.4706324338912964, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8864234089851379, + "num_tokens": 327998624.0, + "step": 8996 + }, + { + "epoch": 1.6707520891364902, + "grad_norm": 1.9470884799957275, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.869720458984375, + "num_tokens": 328026255.0, + "step": 8997 + }, + { + "epoch": 1.6709377901578457, + "grad_norm": 1.5407148599624634, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8713115453720093, + "num_tokens": 328065407.0, + "step": 8998 + }, + { + "epoch": 1.6711234911792014, + "grad_norm": 1.6203416585922241, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8858707547187805, + "num_tokens": 328097499.0, + "step": 8999 + }, + { + "epoch": 1.6713091922005572, + "grad_norm": 1.508146047592163, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8738717436790466, + "num_tokens": 328133784.0, + "step": 9000 + }, + { + "epoch": 1.6714948932219127, + "grad_norm": 1.5550719499588013, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8662394881248474, + "num_tokens": 328172982.0, + "step": 9001 + }, + { + "epoch": 1.6716805942432682, + "grad_norm": 1.6387807130813599, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8801056742668152, + "num_tokens": 328205852.0, + "step": 9002 + }, + { + "epoch": 1.671866295264624, + "grad_norm": 1.5909603834152222, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8879320025444031, + "num_tokens": 328236840.0, + "step": 9003 + }, + { + "epoch": 1.6720519962859797, + "grad_norm": 1.460198163986206, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8853120803833008, + "num_tokens": 328276555.0, + "step": 9004 + }, + { + "epoch": 1.6722376973073352, + "grad_norm": 1.5822300910949707, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8854596018791199, + "num_tokens": 328309707.0, + "step": 9005 + }, + { + "epoch": 1.6724233983286907, + "grad_norm": 1.6579878330230713, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.862671434879303, + "num_tokens": 328347738.0, + "step": 9006 + }, + { + "epoch": 1.6726090993500464, + "grad_norm": 1.6341264247894287, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8776862621307373, + "num_tokens": 328380242.0, + "step": 9007 + }, + { + "epoch": 1.6727948003714022, + "grad_norm": 1.4602075815200806, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.879865825176239, + "num_tokens": 328417043.0, + "step": 9008 + }, + { + "epoch": 1.6729805013927577, + "grad_norm": 1.6062920093536377, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8686201572418213, + "num_tokens": 328452469.0, + "step": 9009 + }, + { + "epoch": 1.6731662024141132, + "grad_norm": 1.6808353662490845, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8797832727432251, + "num_tokens": 328482559.0, + "step": 9010 + }, + { + "epoch": 1.673351903435469, + "grad_norm": 1.5950989723205566, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8633173704147339, + "num_tokens": 328517649.0, + "step": 9011 + }, + { + "epoch": 1.6735376044568246, + "grad_norm": 1.4130825996398926, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8827613592147827, + "num_tokens": 328556980.0, + "step": 9012 + }, + { + "epoch": 1.6737233054781802, + "grad_norm": 1.4347350597381592, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8628820180892944, + "num_tokens": 328601015.0, + "step": 9013 + }, + { + "epoch": 1.6739090064995357, + "grad_norm": 1.390270709991455, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8704506158828735, + "num_tokens": 328651167.0, + "step": 9014 + }, + { + "epoch": 1.6740947075208914, + "grad_norm": 1.5997166633605957, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8685247302055359, + "num_tokens": 328690136.0, + "step": 9015 + }, + { + "epoch": 1.6742804085422471, + "grad_norm": 1.383676290512085, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8785792589187622, + "num_tokens": 328733612.0, + "step": 9016 + }, + { + "epoch": 1.6744661095636026, + "grad_norm": 1.7019786834716797, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8776189684867859, + "num_tokens": 328765500.0, + "step": 9017 + }, + { + "epoch": 1.6746518105849582, + "grad_norm": 1.5582847595214844, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8833633661270142, + "num_tokens": 328799576.0, + "step": 9018 + }, + { + "epoch": 1.674837511606314, + "grad_norm": 1.4781672954559326, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8714481592178345, + "num_tokens": 328841005.0, + "step": 9019 + }, + { + "epoch": 1.6750232126276694, + "grad_norm": 1.6081018447875977, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8789466619491577, + "num_tokens": 328874103.0, + "step": 9020 + }, + { + "epoch": 1.675208913649025, + "grad_norm": 1.5842931270599365, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8636597394943237, + "num_tokens": 328911491.0, + "step": 9021 + }, + { + "epoch": 1.6753946146703806, + "grad_norm": 1.6089096069335938, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.883980393409729, + "num_tokens": 328947627.0, + "step": 9022 + }, + { + "epoch": 1.6755803156917364, + "grad_norm": 1.6235228776931763, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.880903422832489, + "num_tokens": 328981386.0, + "step": 9023 + }, + { + "epoch": 1.6757660167130919, + "grad_norm": 1.6531754732131958, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8634628057479858, + "num_tokens": 329015946.0, + "step": 9024 + }, + { + "epoch": 1.6759517177344474, + "grad_norm": 1.4981937408447266, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8793588876724243, + "num_tokens": 329053114.0, + "step": 9025 + }, + { + "epoch": 1.6761374187558031, + "grad_norm": 1.595942735671997, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8697014451026917, + "num_tokens": 329091520.0, + "step": 9026 + }, + { + "epoch": 1.6763231197771589, + "grad_norm": 1.5378376245498657, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8854406476020813, + "num_tokens": 329126337.0, + "step": 9027 + }, + { + "epoch": 1.6765088207985144, + "grad_norm": 1.5256690979003906, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.883313000202179, + "num_tokens": 329160953.0, + "step": 9028 + }, + { + "epoch": 1.6766945218198699, + "grad_norm": 1.5953731536865234, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.864305853843689, + "num_tokens": 329199695.0, + "step": 9029 + }, + { + "epoch": 1.6768802228412256, + "grad_norm": 1.6461552381515503, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8767277002334595, + "num_tokens": 329229939.0, + "step": 9030 + }, + { + "epoch": 1.6770659238625814, + "grad_norm": 1.5567126274108887, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8752435445785522, + "num_tokens": 329264229.0, + "step": 9031 + }, + { + "epoch": 1.6772516248839369, + "grad_norm": 1.5324757099151611, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8764472007751465, + "num_tokens": 329297874.0, + "step": 9032 + }, + { + "epoch": 1.6774373259052924, + "grad_norm": 1.5863749980926514, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8775494694709778, + "num_tokens": 329331947.0, + "step": 9033 + }, + { + "epoch": 1.677623026926648, + "grad_norm": 1.503868579864502, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8670955896377563, + "num_tokens": 329374701.0, + "step": 9034 + }, + { + "epoch": 1.6778087279480038, + "grad_norm": 1.5458928346633911, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8837587833404541, + "num_tokens": 329418213.0, + "step": 9035 + }, + { + "epoch": 1.6779944289693594, + "grad_norm": 1.5744788646697998, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.877506673336029, + "num_tokens": 329452269.0, + "step": 9036 + }, + { + "epoch": 1.6781801299907149, + "grad_norm": 1.55303955078125, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8826866745948792, + "num_tokens": 329487051.0, + "step": 9037 + }, + { + "epoch": 1.6783658310120706, + "grad_norm": 1.595359444618225, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8724532127380371, + "num_tokens": 329523759.0, + "step": 9038 + }, + { + "epoch": 1.6785515320334263, + "grad_norm": 1.60660719871521, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.877746045589447, + "num_tokens": 329558073.0, + "step": 9039 + }, + { + "epoch": 1.6787372330547818, + "grad_norm": 1.6233855485916138, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8738330006599426, + "num_tokens": 329591736.0, + "step": 9040 + }, + { + "epoch": 1.6789229340761374, + "grad_norm": 1.694949984550476, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8663268685340881, + "num_tokens": 329622737.0, + "step": 9041 + }, + { + "epoch": 1.679108635097493, + "grad_norm": 1.6403453350067139, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8694571256637573, + "num_tokens": 329658871.0, + "step": 9042 + }, + { + "epoch": 1.6792943361188488, + "grad_norm": 1.4848493337631226, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.882082462310791, + "num_tokens": 329695496.0, + "step": 9043 + }, + { + "epoch": 1.679480037140204, + "grad_norm": 1.5613725185394287, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8791594505310059, + "num_tokens": 329730191.0, + "step": 9044 + }, + { + "epoch": 1.6796657381615598, + "grad_norm": 1.4825377464294434, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8802510499954224, + "num_tokens": 329769334.0, + "step": 9045 + }, + { + "epoch": 1.6798514391829156, + "grad_norm": 1.5795414447784424, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8776270151138306, + "num_tokens": 329802014.0, + "step": 9046 + }, + { + "epoch": 1.680037140204271, + "grad_norm": 1.534908413887024, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8785520792007446, + "num_tokens": 329836127.0, + "step": 9047 + }, + { + "epoch": 1.6802228412256266, + "grad_norm": 1.456709861755371, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8746687173843384, + "num_tokens": 329872901.0, + "step": 9048 + }, + { + "epoch": 1.6804085422469823, + "grad_norm": 1.5975701808929443, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8804991841316223, + "num_tokens": 329904848.0, + "step": 9049 + }, + { + "epoch": 1.680594243268338, + "grad_norm": 1.6584116220474243, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8645896911621094, + "num_tokens": 329940187.0, + "step": 9050 + }, + { + "epoch": 1.6807799442896936, + "grad_norm": 1.6139979362487793, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8727346658706665, + "num_tokens": 329974237.0, + "step": 9051 + }, + { + "epoch": 1.680965645311049, + "grad_norm": 1.6056956052780151, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.879948616027832, + "num_tokens": 330007407.0, + "step": 9052 + }, + { + "epoch": 1.6811513463324048, + "grad_norm": 1.7525192499160767, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8675726652145386, + "num_tokens": 330039934.0, + "step": 9053 + }, + { + "epoch": 1.6813370473537605, + "grad_norm": 1.4924753904342651, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8838792443275452, + "num_tokens": 330078897.0, + "step": 9054 + }, + { + "epoch": 1.681522748375116, + "grad_norm": 1.5587570667266846, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8749421238899231, + "num_tokens": 330116629.0, + "step": 9055 + }, + { + "epoch": 1.6817084493964716, + "grad_norm": 1.5579235553741455, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8758647441864014, + "num_tokens": 330149527.0, + "step": 9056 + }, + { + "epoch": 1.6818941504178273, + "grad_norm": 1.5756797790527344, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8879507184028625, + "num_tokens": 330180759.0, + "step": 9057 + }, + { + "epoch": 1.682079851439183, + "grad_norm": 1.6301532983779907, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8895050287246704, + "num_tokens": 330215827.0, + "step": 9058 + }, + { + "epoch": 1.6822655524605385, + "grad_norm": 1.5617905855178833, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8786093592643738, + "num_tokens": 330253536.0, + "step": 9059 + }, + { + "epoch": 1.682451253481894, + "grad_norm": 1.6510262489318848, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8771473169326782, + "num_tokens": 330289202.0, + "step": 9060 + }, + { + "epoch": 1.6826369545032498, + "grad_norm": 1.6585372686386108, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8675746917724609, + "num_tokens": 330325440.0, + "step": 9061 + }, + { + "epoch": 1.6828226555246055, + "grad_norm": 1.5904524326324463, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8706503510475159, + "num_tokens": 330360981.0, + "step": 9062 + }, + { + "epoch": 1.683008356545961, + "grad_norm": 1.5561014413833618, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8841965198516846, + "num_tokens": 330396399.0, + "step": 9063 + }, + { + "epoch": 1.6831940575673165, + "grad_norm": 1.6392191648483276, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8751462697982788, + "num_tokens": 330430798.0, + "step": 9064 + }, + { + "epoch": 1.6833797585886723, + "grad_norm": 1.5989549160003662, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8779701590538025, + "num_tokens": 330465734.0, + "step": 9065 + }, + { + "epoch": 1.683565459610028, + "grad_norm": 1.5669701099395752, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8691142797470093, + "num_tokens": 330501353.0, + "step": 9066 + }, + { + "epoch": 1.6837511606313835, + "grad_norm": 1.4603266716003418, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8909650444984436, + "num_tokens": 330539347.0, + "step": 9067 + }, + { + "epoch": 1.683936861652739, + "grad_norm": 1.534751296043396, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8773033022880554, + "num_tokens": 330577289.0, + "step": 9068 + }, + { + "epoch": 1.6841225626740948, + "grad_norm": 1.4497480392456055, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.893053412437439, + "num_tokens": 330613654.0, + "step": 9069 + }, + { + "epoch": 1.6843082636954503, + "grad_norm": 1.5554370880126953, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8691387176513672, + "num_tokens": 330649686.0, + "step": 9070 + }, + { + "epoch": 1.6844939647168058, + "grad_norm": 1.554940938949585, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8868193626403809, + "num_tokens": 330685381.0, + "step": 9071 + }, + { + "epoch": 1.6846796657381615, + "grad_norm": 1.4959083795547485, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8944500684738159, + "num_tokens": 330721408.0, + "step": 9072 + }, + { + "epoch": 1.6848653667595173, + "grad_norm": 1.5660550594329834, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.881346583366394, + "num_tokens": 330756143.0, + "step": 9073 + }, + { + "epoch": 1.6850510677808728, + "grad_norm": 1.4424076080322266, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8902024030685425, + "num_tokens": 330795598.0, + "step": 9074 + }, + { + "epoch": 1.6852367688022283, + "grad_norm": 1.6652867794036865, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.873138427734375, + "num_tokens": 330827272.0, + "step": 9075 + }, + { + "epoch": 1.685422469823584, + "grad_norm": 1.4423657655715942, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8830544948577881, + "num_tokens": 330865324.0, + "step": 9076 + }, + { + "epoch": 1.6856081708449397, + "grad_norm": 1.5353913307189941, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8770427703857422, + "num_tokens": 330900697.0, + "step": 9077 + }, + { + "epoch": 1.6857938718662953, + "grad_norm": 1.5553224086761475, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8694548606872559, + "num_tokens": 330939116.0, + "step": 9078 + }, + { + "epoch": 1.6859795728876508, + "grad_norm": 1.7376015186309814, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.871564507484436, + "num_tokens": 330972901.0, + "step": 9079 + }, + { + "epoch": 1.6861652739090065, + "grad_norm": 1.5821067094802856, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8705452680587769, + "num_tokens": 331011472.0, + "step": 9080 + }, + { + "epoch": 1.6863509749303622, + "grad_norm": 1.6264889240264893, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8725460171699524, + "num_tokens": 331047612.0, + "step": 9081 + }, + { + "epoch": 1.6865366759517177, + "grad_norm": 1.6065279245376587, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8648468255996704, + "num_tokens": 331085079.0, + "step": 9082 + }, + { + "epoch": 1.6867223769730733, + "grad_norm": 1.40464186668396, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8848051428794861, + "num_tokens": 331121849.0, + "step": 9083 + }, + { + "epoch": 1.686908077994429, + "grad_norm": 1.5692356824874878, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8744456768035889, + "num_tokens": 331155576.0, + "step": 9084 + }, + { + "epoch": 1.6870937790157847, + "grad_norm": 1.679605484008789, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8745768070220947, + "num_tokens": 331191340.0, + "step": 9085 + }, + { + "epoch": 1.6872794800371402, + "grad_norm": 1.559493064880371, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8813508152961731, + "num_tokens": 331226725.0, + "step": 9086 + }, + { + "epoch": 1.6874651810584957, + "grad_norm": 1.603374719619751, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8781185150146484, + "num_tokens": 331263933.0, + "step": 9087 + }, + { + "epoch": 1.6876508820798515, + "grad_norm": 1.4555513858795166, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8761999607086182, + "num_tokens": 331304629.0, + "step": 9088 + }, + { + "epoch": 1.6878365831012072, + "grad_norm": 1.5412837266921997, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8749487996101379, + "num_tokens": 331342896.0, + "step": 9089 + }, + { + "epoch": 1.6880222841225627, + "grad_norm": 1.5305505990982056, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8716009855270386, + "num_tokens": 331378859.0, + "step": 9090 + }, + { + "epoch": 1.6882079851439182, + "grad_norm": 1.4969152212142944, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8881891369819641, + "num_tokens": 331413047.0, + "step": 9091 + }, + { + "epoch": 1.688393686165274, + "grad_norm": 1.4885451793670654, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8933340311050415, + "num_tokens": 331448883.0, + "step": 9092 + }, + { + "epoch": 1.6885793871866295, + "grad_norm": 1.7077888250350952, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8858292102813721, + "num_tokens": 331476162.0, + "step": 9093 + }, + { + "epoch": 1.688765088207985, + "grad_norm": 1.536045789718628, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8597412109375, + "num_tokens": 331513903.0, + "step": 9094 + }, + { + "epoch": 1.6889507892293407, + "grad_norm": 1.6014999151229858, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8843691349029541, + "num_tokens": 331547951.0, + "step": 9095 + }, + { + "epoch": 1.6891364902506965, + "grad_norm": 1.5154000520706177, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8873148560523987, + "num_tokens": 331581705.0, + "step": 9096 + }, + { + "epoch": 1.689322191272052, + "grad_norm": 1.5233036279678345, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8685671091079712, + "num_tokens": 331619377.0, + "step": 9097 + }, + { + "epoch": 1.6895078922934075, + "grad_norm": 1.4771865606307983, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8689154386520386, + "num_tokens": 331659438.0, + "step": 9098 + }, + { + "epoch": 1.6896935933147632, + "grad_norm": 1.5457251071929932, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8836269974708557, + "num_tokens": 331694798.0, + "step": 9099 + }, + { + "epoch": 1.689879294336119, + "grad_norm": 1.4240285158157349, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8819692134857178, + "num_tokens": 331737307.0, + "step": 9100 + }, + { + "epoch": 1.6900649953574745, + "grad_norm": 1.5648248195648193, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8592625260353088, + "num_tokens": 331776244.0, + "step": 9101 + }, + { + "epoch": 1.69025069637883, + "grad_norm": 1.5996284484863281, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8712613582611084, + "num_tokens": 331810488.0, + "step": 9102 + }, + { + "epoch": 1.6904363974001857, + "grad_norm": 1.6732343435287476, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8753562569618225, + "num_tokens": 331846950.0, + "step": 9103 + }, + { + "epoch": 1.6906220984215414, + "grad_norm": 1.829176664352417, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8763306140899658, + "num_tokens": 331885765.0, + "step": 9104 + }, + { + "epoch": 1.690807799442897, + "grad_norm": 1.4302363395690918, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8849654793739319, + "num_tokens": 331925888.0, + "step": 9105 + }, + { + "epoch": 1.6909935004642525, + "grad_norm": 1.5082260370254517, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8747537136077881, + "num_tokens": 331963790.0, + "step": 9106 + }, + { + "epoch": 1.6911792014856082, + "grad_norm": 1.7000728845596313, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8719097375869751, + "num_tokens": 331996934.0, + "step": 9107 + }, + { + "epoch": 1.691364902506964, + "grad_norm": 1.5312453508377075, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8861314654350281, + "num_tokens": 332031904.0, + "step": 9108 + }, + { + "epoch": 1.6915506035283194, + "grad_norm": 1.307067632675171, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.893613338470459, + "num_tokens": 332076489.0, + "step": 9109 + }, + { + "epoch": 1.691736304549675, + "grad_norm": 1.4145289659500122, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.877143144607544, + "num_tokens": 332116204.0, + "step": 9110 + }, + { + "epoch": 1.6919220055710307, + "grad_norm": 1.6150214672088623, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8739003539085388, + "num_tokens": 332149669.0, + "step": 9111 + }, + { + "epoch": 1.6921077065923864, + "grad_norm": 1.4867031574249268, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8776722550392151, + "num_tokens": 332187969.0, + "step": 9112 + }, + { + "epoch": 1.692293407613742, + "grad_norm": 1.409218430519104, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8768821954727173, + "num_tokens": 332228706.0, + "step": 9113 + }, + { + "epoch": 1.6924791086350974, + "grad_norm": 1.4449293613433838, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8757110834121704, + "num_tokens": 332273203.0, + "step": 9114 + }, + { + "epoch": 1.6926648096564532, + "grad_norm": 1.4452729225158691, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8718047738075256, + "num_tokens": 332317532.0, + "step": 9115 + }, + { + "epoch": 1.6928505106778087, + "grad_norm": 1.6462223529815674, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8679472208023071, + "num_tokens": 332353352.0, + "step": 9116 + }, + { + "epoch": 1.6930362116991642, + "grad_norm": 1.4175662994384766, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8735823631286621, + "num_tokens": 332398338.0, + "step": 9117 + }, + { + "epoch": 1.69322191272052, + "grad_norm": 1.4957903623580933, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8805360794067383, + "num_tokens": 332438235.0, + "step": 9118 + }, + { + "epoch": 1.6934076137418757, + "grad_norm": 1.6963863372802734, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8740646839141846, + "num_tokens": 332469579.0, + "step": 9119 + }, + { + "epoch": 1.6935933147632312, + "grad_norm": 1.4452853202819824, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8862125873565674, + "num_tokens": 332508145.0, + "step": 9120 + }, + { + "epoch": 1.6937790157845867, + "grad_norm": 1.5995744466781616, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8690711259841919, + "num_tokens": 332544400.0, + "step": 9121 + }, + { + "epoch": 1.6939647168059424, + "grad_norm": 1.63810133934021, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8806719779968262, + "num_tokens": 332580352.0, + "step": 9122 + }, + { + "epoch": 1.6941504178272981, + "grad_norm": 1.5619806051254272, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8722918033599854, + "num_tokens": 332620867.0, + "step": 9123 + }, + { + "epoch": 1.6943361188486536, + "grad_norm": 1.5141048431396484, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8865446448326111, + "num_tokens": 332655619.0, + "step": 9124 + }, + { + "epoch": 1.6945218198700092, + "grad_norm": 1.5632072687149048, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8703715801239014, + "num_tokens": 332692476.0, + "step": 9125 + }, + { + "epoch": 1.694707520891365, + "grad_norm": 1.4446523189544678, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8892567753791809, + "num_tokens": 332727107.0, + "step": 9126 + }, + { + "epoch": 1.6948932219127206, + "grad_norm": 1.4959396123886108, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8904269933700562, + "num_tokens": 332762129.0, + "step": 9127 + }, + { + "epoch": 1.6950789229340761, + "grad_norm": 1.667614221572876, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.866056501865387, + "num_tokens": 332797166.0, + "step": 9128 + }, + { + "epoch": 1.6952646239554316, + "grad_norm": 1.4944822788238525, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8851971626281738, + "num_tokens": 332834542.0, + "step": 9129 + }, + { + "epoch": 1.6954503249767874, + "grad_norm": 1.6155667304992676, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.880028486251831, + "num_tokens": 332871132.0, + "step": 9130 + }, + { + "epoch": 1.6956360259981431, + "grad_norm": 1.5974855422973633, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8632370233535767, + "num_tokens": 332905983.0, + "step": 9131 + }, + { + "epoch": 1.6958217270194986, + "grad_norm": 1.6934677362442017, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8790974617004395, + "num_tokens": 332936533.0, + "step": 9132 + }, + { + "epoch": 1.6960074280408541, + "grad_norm": 1.560375452041626, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8892682194709778, + "num_tokens": 332968059.0, + "step": 9133 + }, + { + "epoch": 1.6961931290622099, + "grad_norm": 1.566838026046753, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8773247003555298, + "num_tokens": 333006513.0, + "step": 9134 + }, + { + "epoch": 1.6963788300835656, + "grad_norm": 1.6560657024383545, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8730369806289673, + "num_tokens": 333043133.0, + "step": 9135 + }, + { + "epoch": 1.6965645311049211, + "grad_norm": 1.672263503074646, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8734922409057617, + "num_tokens": 333078313.0, + "step": 9136 + }, + { + "epoch": 1.6967502321262766, + "grad_norm": 1.5571563243865967, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8789347410202026, + "num_tokens": 333110151.0, + "step": 9137 + }, + { + "epoch": 1.6969359331476324, + "grad_norm": 1.628226637840271, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8741719722747803, + "num_tokens": 333144485.0, + "step": 9138 + }, + { + "epoch": 1.697121634168988, + "grad_norm": 1.4747315645217896, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8824466466903687, + "num_tokens": 333181541.0, + "step": 9139 + }, + { + "epoch": 1.6973073351903436, + "grad_norm": 1.5451457500457764, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8764981031417847, + "num_tokens": 333217968.0, + "step": 9140 + }, + { + "epoch": 1.6974930362116991, + "grad_norm": 1.422041654586792, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8890745639801025, + "num_tokens": 333255099.0, + "step": 9141 + }, + { + "epoch": 1.6976787372330548, + "grad_norm": 1.6364299058914185, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8770007491111755, + "num_tokens": 333289660.0, + "step": 9142 + }, + { + "epoch": 1.6978644382544104, + "grad_norm": 1.5274618864059448, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.868990421295166, + "num_tokens": 333326652.0, + "step": 9143 + }, + { + "epoch": 1.6980501392757659, + "grad_norm": 1.5077797174453735, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8724302053451538, + "num_tokens": 333361252.0, + "step": 9144 + }, + { + "epoch": 1.6982358402971216, + "grad_norm": 1.5971661806106567, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8938918709754944, + "num_tokens": 333391421.0, + "step": 9145 + }, + { + "epoch": 1.6984215413184773, + "grad_norm": 1.4928981065750122, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8792507648468018, + "num_tokens": 333428133.0, + "step": 9146 + }, + { + "epoch": 1.6986072423398328, + "grad_norm": 1.6407053470611572, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8655921816825867, + "num_tokens": 333464511.0, + "step": 9147 + }, + { + "epoch": 1.6987929433611884, + "grad_norm": 1.4932953119277954, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8850946426391602, + "num_tokens": 333501301.0, + "step": 9148 + }, + { + "epoch": 1.698978644382544, + "grad_norm": 1.6011168956756592, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8688803911209106, + "num_tokens": 333532929.0, + "step": 9149 + }, + { + "epoch": 1.6991643454038998, + "grad_norm": 1.4845761060714722, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8863641619682312, + "num_tokens": 333570188.0, + "step": 9150 + }, + { + "epoch": 1.6993500464252553, + "grad_norm": 1.662381649017334, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8914371728897095, + "num_tokens": 333598622.0, + "step": 9151 + }, + { + "epoch": 1.6995357474466108, + "grad_norm": 1.6441110372543335, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8719056844711304, + "num_tokens": 333633396.0, + "step": 9152 + }, + { + "epoch": 1.6997214484679666, + "grad_norm": 1.5996031761169434, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8811691403388977, + "num_tokens": 333665664.0, + "step": 9153 + }, + { + "epoch": 1.6999071494893223, + "grad_norm": 1.614027500152588, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8786477446556091, + "num_tokens": 333695202.0, + "step": 9154 + }, + { + "epoch": 1.7000928505106778, + "grad_norm": 1.5533031225204468, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8636986017227173, + "num_tokens": 333737223.0, + "step": 9155 + }, + { + "epoch": 1.7002785515320333, + "grad_norm": 1.462302565574646, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8750185370445251, + "num_tokens": 333777165.0, + "step": 9156 + }, + { + "epoch": 1.700464252553389, + "grad_norm": 1.5084097385406494, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8834939002990723, + "num_tokens": 333814838.0, + "step": 9157 + }, + { + "epoch": 1.7006499535747448, + "grad_norm": 1.5444060564041138, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.867392361164093, + "num_tokens": 333855124.0, + "step": 9158 + }, + { + "epoch": 1.7008356545961003, + "grad_norm": 1.6408058404922485, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.864075243473053, + "num_tokens": 333890806.0, + "step": 9159 + }, + { + "epoch": 1.7010213556174558, + "grad_norm": 1.5374051332473755, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8877739310264587, + "num_tokens": 333927049.0, + "step": 9160 + }, + { + "epoch": 1.7012070566388116, + "grad_norm": 1.55675208568573, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8575406670570374, + "num_tokens": 333964937.0, + "step": 9161 + }, + { + "epoch": 1.7013927576601673, + "grad_norm": 1.6058582067489624, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8869779109954834, + "num_tokens": 333997663.0, + "step": 9162 + }, + { + "epoch": 1.7015784586815228, + "grad_norm": 1.5398623943328857, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8761805891990662, + "num_tokens": 334035997.0, + "step": 9163 + }, + { + "epoch": 1.7017641597028783, + "grad_norm": 1.3608068227767944, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8949398398399353, + "num_tokens": 334075266.0, + "step": 9164 + }, + { + "epoch": 1.701949860724234, + "grad_norm": 1.5189554691314697, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8901185989379883, + "num_tokens": 334109178.0, + "step": 9165 + }, + { + "epoch": 1.7021355617455896, + "grad_norm": 1.592963457107544, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8738662004470825, + "num_tokens": 334144346.0, + "step": 9166 + }, + { + "epoch": 1.702321262766945, + "grad_norm": 1.4692022800445557, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8852075338363647, + "num_tokens": 334181760.0, + "step": 9167 + }, + { + "epoch": 1.7025069637883008, + "grad_norm": 1.544994831085205, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8621647357940674, + "num_tokens": 334223857.0, + "step": 9168 + }, + { + "epoch": 1.7026926648096565, + "grad_norm": 1.506847858428955, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8780019283294678, + "num_tokens": 334263914.0, + "step": 9169 + }, + { + "epoch": 1.702878365831012, + "grad_norm": 1.7976980209350586, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8729385733604431, + "num_tokens": 334297161.0, + "step": 9170 + }, + { + "epoch": 1.7030640668523676, + "grad_norm": 1.551114797592163, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8786039352416992, + "num_tokens": 334332909.0, + "step": 9171 + }, + { + "epoch": 1.7032497678737233, + "grad_norm": 1.4960172176361084, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8863655924797058, + "num_tokens": 334372320.0, + "step": 9172 + }, + { + "epoch": 1.703435468895079, + "grad_norm": 1.612140417098999, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8768162131309509, + "num_tokens": 334404787.0, + "step": 9173 + }, + { + "epoch": 1.7036211699164345, + "grad_norm": 1.5472941398620605, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8644173741340637, + "num_tokens": 334443333.0, + "step": 9174 + }, + { + "epoch": 1.70380687093779, + "grad_norm": 1.5232229232788086, + "learning_rate": 1e-06, + "loss": 0.2759, + "mean_token_accuracy": 0.8995275497436523, + "num_tokens": 334478993.0, + "step": 9175 + }, + { + "epoch": 1.7039925719591458, + "grad_norm": 1.6635069847106934, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8734473586082458, + "num_tokens": 334513815.0, + "step": 9176 + }, + { + "epoch": 1.7041782729805015, + "grad_norm": 1.5272669792175293, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8708599805831909, + "num_tokens": 334553516.0, + "step": 9177 + }, + { + "epoch": 1.704363974001857, + "grad_norm": 1.39927077293396, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8851302862167358, + "num_tokens": 334593887.0, + "step": 9178 + }, + { + "epoch": 1.7045496750232125, + "grad_norm": 1.6332018375396729, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8756765127182007, + "num_tokens": 334629762.0, + "step": 9179 + }, + { + "epoch": 1.7047353760445683, + "grad_norm": 1.5277074575424194, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.862259030342102, + "num_tokens": 334671175.0, + "step": 9180 + }, + { + "epoch": 1.704921077065924, + "grad_norm": 1.4291507005691528, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8815884590148926, + "num_tokens": 334706629.0, + "step": 9181 + }, + { + "epoch": 1.7051067780872795, + "grad_norm": 1.5858550071716309, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.887704610824585, + "num_tokens": 334742429.0, + "step": 9182 + }, + { + "epoch": 1.705292479108635, + "grad_norm": 1.548455834388733, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8736063838005066, + "num_tokens": 334781671.0, + "step": 9183 + }, + { + "epoch": 1.7054781801299908, + "grad_norm": 1.4961967468261719, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8702285885810852, + "num_tokens": 334818187.0, + "step": 9184 + }, + { + "epoch": 1.7056638811513465, + "grad_norm": 1.4049131870269775, + "learning_rate": 1e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.8967546820640564, + "num_tokens": 334854052.0, + "step": 9185 + }, + { + "epoch": 1.705849582172702, + "grad_norm": 1.510503888130188, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8726862668991089, + "num_tokens": 334891685.0, + "step": 9186 + }, + { + "epoch": 1.7060352831940575, + "grad_norm": 1.6034479141235352, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8740981817245483, + "num_tokens": 334924233.0, + "step": 9187 + }, + { + "epoch": 1.7062209842154132, + "grad_norm": 1.5489003658294678, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8748598098754883, + "num_tokens": 334964108.0, + "step": 9188 + }, + { + "epoch": 1.7064066852367687, + "grad_norm": 1.4537436962127686, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.888297438621521, + "num_tokens": 335001839.0, + "step": 9189 + }, + { + "epoch": 1.7065923862581243, + "grad_norm": 1.5112249851226807, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8773360252380371, + "num_tokens": 335041933.0, + "step": 9190 + }, + { + "epoch": 1.70677808727948, + "grad_norm": 1.5684123039245605, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8836297988891602, + "num_tokens": 335075134.0, + "step": 9191 + }, + { + "epoch": 1.7069637883008357, + "grad_norm": 1.5185683965682983, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8732571005821228, + "num_tokens": 335113630.0, + "step": 9192 + }, + { + "epoch": 1.7071494893221912, + "grad_norm": 1.573887586593628, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8691222667694092, + "num_tokens": 335151575.0, + "step": 9193 + }, + { + "epoch": 1.7073351903435467, + "grad_norm": 1.680681824684143, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8749617338180542, + "num_tokens": 335183455.0, + "step": 9194 + }, + { + "epoch": 1.7075208913649025, + "grad_norm": 1.478846788406372, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8627774119377136, + "num_tokens": 335225747.0, + "step": 9195 + }, + { + "epoch": 1.7077065923862582, + "grad_norm": 1.4365191459655762, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8881499171257019, + "num_tokens": 335261671.0, + "step": 9196 + }, + { + "epoch": 1.7078922934076137, + "grad_norm": 1.4133497476577759, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.873404860496521, + "num_tokens": 335302742.0, + "step": 9197 + }, + { + "epoch": 1.7080779944289692, + "grad_norm": 1.6627286672592163, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.846602201461792, + "num_tokens": 335341688.0, + "step": 9198 + }, + { + "epoch": 1.708263695450325, + "grad_norm": 1.611136555671692, + "learning_rate": 1e-06, + "loss": 0.2849, + "mean_token_accuracy": 0.8974271416664124, + "num_tokens": 335372616.0, + "step": 9199 + }, + { + "epoch": 1.7084493964716807, + "grad_norm": 1.5942848920822144, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8632470369338989, + "num_tokens": 335409726.0, + "step": 9200 + }, + { + "epoch": 1.7086350974930362, + "grad_norm": 1.6037718057632446, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8787297010421753, + "num_tokens": 335442350.0, + "step": 9201 + }, + { + "epoch": 1.7088207985143917, + "grad_norm": 1.5026357173919678, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8652936816215515, + "num_tokens": 335482826.0, + "step": 9202 + }, + { + "epoch": 1.7090064995357475, + "grad_norm": 1.4331862926483154, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8809746503829956, + "num_tokens": 335527548.0, + "step": 9203 + }, + { + "epoch": 1.7091922005571032, + "grad_norm": 1.490427017211914, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8925426602363586, + "num_tokens": 335560317.0, + "step": 9204 + }, + { + "epoch": 1.7093779015784587, + "grad_norm": 1.4209632873535156, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8673114776611328, + "num_tokens": 335604855.0, + "step": 9205 + }, + { + "epoch": 1.7095636025998142, + "grad_norm": 1.4040716886520386, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8784204125404358, + "num_tokens": 335644913.0, + "step": 9206 + }, + { + "epoch": 1.70974930362117, + "grad_norm": 1.690629005432129, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8604714274406433, + "num_tokens": 335680676.0, + "step": 9207 + }, + { + "epoch": 1.7099350046425257, + "grad_norm": 1.650354027748108, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8854472637176514, + "num_tokens": 335711356.0, + "step": 9208 + }, + { + "epoch": 1.7101207056638812, + "grad_norm": 1.4413717985153198, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8823264837265015, + "num_tokens": 335754782.0, + "step": 9209 + }, + { + "epoch": 1.7103064066852367, + "grad_norm": 1.5546685457229614, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8548869490623474, + "num_tokens": 335793475.0, + "step": 9210 + }, + { + "epoch": 1.7104921077065924, + "grad_norm": 1.4871007204055786, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8943753242492676, + "num_tokens": 335829401.0, + "step": 9211 + }, + { + "epoch": 1.7106778087279482, + "grad_norm": 1.665901780128479, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8735311031341553, + "num_tokens": 335864805.0, + "step": 9212 + }, + { + "epoch": 1.7108635097493035, + "grad_norm": 1.617550253868103, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8785375356674194, + "num_tokens": 335896519.0, + "step": 9213 + }, + { + "epoch": 1.7110492107706592, + "grad_norm": 1.5394794940948486, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8822999596595764, + "num_tokens": 335930242.0, + "step": 9214 + }, + { + "epoch": 1.711234911792015, + "grad_norm": 1.4778633117675781, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8841403126716614, + "num_tokens": 335968022.0, + "step": 9215 + }, + { + "epoch": 1.7114206128133704, + "grad_norm": 1.399139642715454, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8958158493041992, + "num_tokens": 336006089.0, + "step": 9216 + }, + { + "epoch": 1.711606313834726, + "grad_norm": 1.4365326166152954, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8835911750793457, + "num_tokens": 336047380.0, + "step": 9217 + }, + { + "epoch": 1.7117920148560817, + "grad_norm": 1.5552729368209839, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8891103863716125, + "num_tokens": 336080875.0, + "step": 9218 + }, + { + "epoch": 1.7119777158774374, + "grad_norm": 1.5681082010269165, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8648198843002319, + "num_tokens": 336118561.0, + "step": 9219 + }, + { + "epoch": 1.712163416898793, + "grad_norm": 1.538429856300354, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8764708638191223, + "num_tokens": 336157107.0, + "step": 9220 + }, + { + "epoch": 1.7123491179201484, + "grad_norm": 1.6340570449829102, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8705720901489258, + "num_tokens": 336193490.0, + "step": 9221 + }, + { + "epoch": 1.7125348189415042, + "grad_norm": 1.5941816568374634, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8881773948669434, + "num_tokens": 336227278.0, + "step": 9222 + }, + { + "epoch": 1.71272051996286, + "grad_norm": 1.500327229499817, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8895314931869507, + "num_tokens": 336262296.0, + "step": 9223 + }, + { + "epoch": 1.7129062209842154, + "grad_norm": 1.6044360399246216, + "learning_rate": 1e-06, + "loss": 0.2646, + "mean_token_accuracy": 0.9051477313041687, + "num_tokens": 336293655.0, + "step": 9224 + }, + { + "epoch": 1.713091922005571, + "grad_norm": 1.5014759302139282, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.879153847694397, + "num_tokens": 336330627.0, + "step": 9225 + }, + { + "epoch": 1.7132776230269267, + "grad_norm": 1.6587978601455688, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8738975524902344, + "num_tokens": 336363125.0, + "step": 9226 + }, + { + "epoch": 1.7134633240482824, + "grad_norm": 1.4362858533859253, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8667908906936646, + "num_tokens": 336407403.0, + "step": 9227 + }, + { + "epoch": 1.713649025069638, + "grad_norm": 1.4388563632965088, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8751035928726196, + "num_tokens": 336452502.0, + "step": 9228 + }, + { + "epoch": 1.7138347260909934, + "grad_norm": 1.5033526420593262, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8934590816497803, + "num_tokens": 336487492.0, + "step": 9229 + }, + { + "epoch": 1.7140204271123491, + "grad_norm": 1.6228610277175903, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8834046125411987, + "num_tokens": 336523565.0, + "step": 9230 + }, + { + "epoch": 1.7142061281337049, + "grad_norm": 1.7704050540924072, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8703441023826599, + "num_tokens": 336554524.0, + "step": 9231 + }, + { + "epoch": 1.7143918291550604, + "grad_norm": 1.6361297369003296, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8740548491477966, + "num_tokens": 336589181.0, + "step": 9232 + }, + { + "epoch": 1.714577530176416, + "grad_norm": 1.5919817686080933, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8723021745681763, + "num_tokens": 336622202.0, + "step": 9233 + }, + { + "epoch": 1.7147632311977716, + "grad_norm": 1.4441702365875244, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8806231617927551, + "num_tokens": 336663445.0, + "step": 9234 + }, + { + "epoch": 1.7149489322191274, + "grad_norm": 1.516339659690857, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8843247890472412, + "num_tokens": 336704398.0, + "step": 9235 + }, + { + "epoch": 1.7151346332404829, + "grad_norm": 1.6769945621490479, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8621706962585449, + "num_tokens": 336737994.0, + "step": 9236 + }, + { + "epoch": 1.7153203342618384, + "grad_norm": 1.4071991443634033, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.875607967376709, + "num_tokens": 336783207.0, + "step": 9237 + }, + { + "epoch": 1.7155060352831941, + "grad_norm": 1.5601553916931152, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8794834017753601, + "num_tokens": 336816772.0, + "step": 9238 + }, + { + "epoch": 1.7156917363045496, + "grad_norm": 1.4338465929031372, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8916316032409668, + "num_tokens": 336853049.0, + "step": 9239 + }, + { + "epoch": 1.7158774373259051, + "grad_norm": 1.6863847970962524, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8705928325653076, + "num_tokens": 336886234.0, + "step": 9240 + }, + { + "epoch": 1.7160631383472609, + "grad_norm": 1.596379280090332, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8873269557952881, + "num_tokens": 336924193.0, + "step": 9241 + }, + { + "epoch": 1.7162488393686166, + "grad_norm": 1.6542459726333618, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8581737875938416, + "num_tokens": 336962005.0, + "step": 9242 + }, + { + "epoch": 1.7164345403899721, + "grad_norm": 1.4857202768325806, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8871405124664307, + "num_tokens": 336998332.0, + "step": 9243 + }, + { + "epoch": 1.7166202414113276, + "grad_norm": 1.351119041442871, + "learning_rate": 1e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.8963714838027954, + "num_tokens": 337037026.0, + "step": 9244 + }, + { + "epoch": 1.7168059424326834, + "grad_norm": 1.454574465751648, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8927982449531555, + "num_tokens": 337074532.0, + "step": 9245 + }, + { + "epoch": 1.716991643454039, + "grad_norm": 1.6139943599700928, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8753739595413208, + "num_tokens": 337112528.0, + "step": 9246 + }, + { + "epoch": 1.7171773444753946, + "grad_norm": 1.6520906686782837, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8589786291122437, + "num_tokens": 337148602.0, + "step": 9247 + }, + { + "epoch": 1.7173630454967501, + "grad_norm": 1.391402006149292, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8834144473075867, + "num_tokens": 337189289.0, + "step": 9248 + }, + { + "epoch": 1.7175487465181059, + "grad_norm": 1.603434681892395, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8774173259735107, + "num_tokens": 337225517.0, + "step": 9249 + }, + { + "epoch": 1.7177344475394616, + "grad_norm": 1.453517198562622, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8828251361846924, + "num_tokens": 337263305.0, + "step": 9250 + }, + { + "epoch": 1.717920148560817, + "grad_norm": 1.6162726879119873, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.865955114364624, + "num_tokens": 337301315.0, + "step": 9251 + }, + { + "epoch": 1.7181058495821726, + "grad_norm": 1.4772337675094604, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8771106004714966, + "num_tokens": 337339255.0, + "step": 9252 + }, + { + "epoch": 1.7182915506035283, + "grad_norm": 1.5495436191558838, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8814239501953125, + "num_tokens": 337373324.0, + "step": 9253 + }, + { + "epoch": 1.718477251624884, + "grad_norm": 1.8657958507537842, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8842127919197083, + "num_tokens": 337400564.0, + "step": 9254 + }, + { + "epoch": 1.7186629526462396, + "grad_norm": 1.5631136894226074, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.858063280582428, + "num_tokens": 337438824.0, + "step": 9255 + }, + { + "epoch": 1.718848653667595, + "grad_norm": 1.6091771125793457, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8809373378753662, + "num_tokens": 337470923.0, + "step": 9256 + }, + { + "epoch": 1.7190343546889508, + "grad_norm": 1.485809564590454, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8921076059341431, + "num_tokens": 337504612.0, + "step": 9257 + }, + { + "epoch": 1.7192200557103066, + "grad_norm": 1.535349726676941, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8797661066055298, + "num_tokens": 337542059.0, + "step": 9258 + }, + { + "epoch": 1.719405756731662, + "grad_norm": 1.5638318061828613, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.86497962474823, + "num_tokens": 337579873.0, + "step": 9259 + }, + { + "epoch": 1.7195914577530176, + "grad_norm": 1.5234768390655518, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8777337670326233, + "num_tokens": 337617416.0, + "step": 9260 + }, + { + "epoch": 1.7197771587743733, + "grad_norm": 1.646958827972412, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8753248453140259, + "num_tokens": 337648691.0, + "step": 9261 + }, + { + "epoch": 1.7199628597957288, + "grad_norm": 1.5362178087234497, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8883916139602661, + "num_tokens": 337685360.0, + "step": 9262 + }, + { + "epoch": 1.7201485608170843, + "grad_norm": 1.5539523363113403, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8802045583724976, + "num_tokens": 337719717.0, + "step": 9263 + }, + { + "epoch": 1.72033426183844, + "grad_norm": 1.6022039651870728, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8785794973373413, + "num_tokens": 337757158.0, + "step": 9264 + }, + { + "epoch": 1.7205199628597958, + "grad_norm": 1.757848858833313, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8875527381896973, + "num_tokens": 337788385.0, + "step": 9265 + }, + { + "epoch": 1.7207056638811513, + "grad_norm": 1.7627320289611816, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8729290962219238, + "num_tokens": 337821134.0, + "step": 9266 + }, + { + "epoch": 1.7208913649025068, + "grad_norm": 1.6970018148422241, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8809633255004883, + "num_tokens": 337852319.0, + "step": 9267 + }, + { + "epoch": 1.7210770659238626, + "grad_norm": 1.5240576267242432, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8825902938842773, + "num_tokens": 337888308.0, + "step": 9268 + }, + { + "epoch": 1.7212627669452183, + "grad_norm": 1.5634479522705078, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.872509241104126, + "num_tokens": 337924970.0, + "step": 9269 + }, + { + "epoch": 1.7214484679665738, + "grad_norm": 1.4558128118515015, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8773869276046753, + "num_tokens": 337967406.0, + "step": 9270 + }, + { + "epoch": 1.7216341689879293, + "grad_norm": 1.513238549232483, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8753702640533447, + "num_tokens": 338006535.0, + "step": 9271 + }, + { + "epoch": 1.721819870009285, + "grad_norm": 1.4554001092910767, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8820555210113525, + "num_tokens": 338043492.0, + "step": 9272 + }, + { + "epoch": 1.7220055710306408, + "grad_norm": 1.567455768585205, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8748411536216736, + "num_tokens": 338078632.0, + "step": 9273 + }, + { + "epoch": 1.7221912720519963, + "grad_norm": 1.632380485534668, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8635736703872681, + "num_tokens": 338113733.0, + "step": 9274 + }, + { + "epoch": 1.7223769730733518, + "grad_norm": 1.69809091091156, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8619227409362793, + "num_tokens": 338151152.0, + "step": 9275 + }, + { + "epoch": 1.7225626740947075, + "grad_norm": 1.42984139919281, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8709065914154053, + "num_tokens": 338191417.0, + "step": 9276 + }, + { + "epoch": 1.7227483751160633, + "grad_norm": 1.391934871673584, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8914045691490173, + "num_tokens": 338231561.0, + "step": 9277 + }, + { + "epoch": 1.7229340761374188, + "grad_norm": 1.550199031829834, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8766161203384399, + "num_tokens": 338270582.0, + "step": 9278 + }, + { + "epoch": 1.7231197771587743, + "grad_norm": 1.4727210998535156, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.882831335067749, + "num_tokens": 338306042.0, + "step": 9279 + }, + { + "epoch": 1.72330547818013, + "grad_norm": 1.4553340673446655, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8764746785163879, + "num_tokens": 338344520.0, + "step": 9280 + }, + { + "epoch": 1.7234911792014858, + "grad_norm": 1.538162350654602, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8780467510223389, + "num_tokens": 338381669.0, + "step": 9281 + }, + { + "epoch": 1.7236768802228413, + "grad_norm": 1.5697640180587769, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8614813089370728, + "num_tokens": 338418255.0, + "step": 9282 + }, + { + "epoch": 1.7238625812441968, + "grad_norm": 1.4926996231079102, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.886879563331604, + "num_tokens": 338456103.0, + "step": 9283 + }, + { + "epoch": 1.7240482822655525, + "grad_norm": 1.5195956230163574, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8664026260375977, + "num_tokens": 338495577.0, + "step": 9284 + }, + { + "epoch": 1.724233983286908, + "grad_norm": 1.5356292724609375, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8726192712783813, + "num_tokens": 338533598.0, + "step": 9285 + }, + { + "epoch": 1.7244196843082635, + "grad_norm": 1.5029629468917847, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8770866990089417, + "num_tokens": 338571576.0, + "step": 9286 + }, + { + "epoch": 1.7246053853296193, + "grad_norm": 1.4609324932098389, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8937286138534546, + "num_tokens": 338605665.0, + "step": 9287 + }, + { + "epoch": 1.724791086350975, + "grad_norm": 1.6856263875961304, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8735133409500122, + "num_tokens": 338636825.0, + "step": 9288 + }, + { + "epoch": 1.7249767873723305, + "grad_norm": 1.6069172620773315, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.874109148979187, + "num_tokens": 338676573.0, + "step": 9289 + }, + { + "epoch": 1.725162488393686, + "grad_norm": 1.574195384979248, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8814906477928162, + "num_tokens": 338710759.0, + "step": 9290 + }, + { + "epoch": 1.7253481894150418, + "grad_norm": 1.515209436416626, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8785587549209595, + "num_tokens": 338746923.0, + "step": 9291 + }, + { + "epoch": 1.7255338904363975, + "grad_norm": 1.5339641571044922, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8744186162948608, + "num_tokens": 338781746.0, + "step": 9292 + }, + { + "epoch": 1.725719591457753, + "grad_norm": 1.5446488857269287, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8808764219284058, + "num_tokens": 338817054.0, + "step": 9293 + }, + { + "epoch": 1.7259052924791085, + "grad_norm": 1.560383915901184, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8728357553482056, + "num_tokens": 338855833.0, + "step": 9294 + }, + { + "epoch": 1.7260909935004642, + "grad_norm": 1.5650120973587036, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8795968294143677, + "num_tokens": 338893839.0, + "step": 9295 + }, + { + "epoch": 1.72627669452182, + "grad_norm": 1.5177996158599854, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8872089385986328, + "num_tokens": 338934215.0, + "step": 9296 + }, + { + "epoch": 1.7264623955431755, + "grad_norm": 1.4874407052993774, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8769837617874146, + "num_tokens": 338974441.0, + "step": 9297 + }, + { + "epoch": 1.726648096564531, + "grad_norm": 1.6488583087921143, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8745942115783691, + "num_tokens": 339007828.0, + "step": 9298 + }, + { + "epoch": 1.7268337975858867, + "grad_norm": 1.6197890043258667, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8720324039459229, + "num_tokens": 339040605.0, + "step": 9299 + }, + { + "epoch": 1.7270194986072425, + "grad_norm": 1.746370792388916, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8739597201347351, + "num_tokens": 339068300.0, + "step": 9300 + }, + { + "epoch": 1.727205199628598, + "grad_norm": 1.8077634572982788, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8689365386962891, + "num_tokens": 339101958.0, + "step": 9301 + }, + { + "epoch": 1.7273909006499535, + "grad_norm": 1.6384600400924683, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8807525634765625, + "num_tokens": 339140375.0, + "step": 9302 + }, + { + "epoch": 1.7275766016713092, + "grad_norm": 1.4958308935165405, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8916240930557251, + "num_tokens": 339175148.0, + "step": 9303 + }, + { + "epoch": 1.727762302692665, + "grad_norm": 1.5911778211593628, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8760093450546265, + "num_tokens": 339211470.0, + "step": 9304 + }, + { + "epoch": 1.7279480037140205, + "grad_norm": 1.4572315216064453, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8835195302963257, + "num_tokens": 339249091.0, + "step": 9305 + }, + { + "epoch": 1.728133704735376, + "grad_norm": 1.3505927324295044, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8876392245292664, + "num_tokens": 339289720.0, + "step": 9306 + }, + { + "epoch": 1.7283194057567317, + "grad_norm": 1.381062626838684, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.877021074295044, + "num_tokens": 339331334.0, + "step": 9307 + }, + { + "epoch": 1.7285051067780874, + "grad_norm": 1.5698914527893066, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8768028020858765, + "num_tokens": 339367381.0, + "step": 9308 + }, + { + "epoch": 1.728690807799443, + "grad_norm": 1.5109339952468872, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8854471445083618, + "num_tokens": 339403744.0, + "step": 9309 + }, + { + "epoch": 1.7288765088207985, + "grad_norm": 1.5889325141906738, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8921281099319458, + "num_tokens": 339434541.0, + "step": 9310 + }, + { + "epoch": 1.7290622098421542, + "grad_norm": 1.446015477180481, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8861825466156006, + "num_tokens": 339473656.0, + "step": 9311 + }, + { + "epoch": 1.7292479108635097, + "grad_norm": 1.4559706449508667, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8790613412857056, + "num_tokens": 339513366.0, + "step": 9312 + }, + { + "epoch": 1.7294336118848652, + "grad_norm": 1.5188556909561157, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8794096112251282, + "num_tokens": 339548418.0, + "step": 9313 + }, + { + "epoch": 1.729619312906221, + "grad_norm": 1.660258173942566, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8594970107078552, + "num_tokens": 339582037.0, + "step": 9314 + }, + { + "epoch": 1.7298050139275767, + "grad_norm": 1.4042919874191284, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8779580593109131, + "num_tokens": 339622398.0, + "step": 9315 + }, + { + "epoch": 1.7299907149489322, + "grad_norm": 1.4369169473648071, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8778282403945923, + "num_tokens": 339664423.0, + "step": 9316 + }, + { + "epoch": 1.7301764159702877, + "grad_norm": 1.616506814956665, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8757811784744263, + "num_tokens": 339696221.0, + "step": 9317 + }, + { + "epoch": 1.7303621169916434, + "grad_norm": 1.5013176202774048, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8818113803863525, + "num_tokens": 339733343.0, + "step": 9318 + }, + { + "epoch": 1.7305478180129992, + "grad_norm": 1.458992600440979, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8783947825431824, + "num_tokens": 339769762.0, + "step": 9319 + }, + { + "epoch": 1.7307335190343547, + "grad_norm": 1.5286872386932373, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.854966938495636, + "num_tokens": 339811831.0, + "step": 9320 + }, + { + "epoch": 1.7309192200557102, + "grad_norm": 1.571424126625061, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.877802848815918, + "num_tokens": 339847383.0, + "step": 9321 + }, + { + "epoch": 1.731104921077066, + "grad_norm": 1.6489317417144775, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8696361780166626, + "num_tokens": 339879545.0, + "step": 9322 + }, + { + "epoch": 1.7312906220984217, + "grad_norm": 1.4899556636810303, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8646193742752075, + "num_tokens": 339917750.0, + "step": 9323 + }, + { + "epoch": 1.7314763231197772, + "grad_norm": 1.3822147846221924, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8909240365028381, + "num_tokens": 339955326.0, + "step": 9324 + }, + { + "epoch": 1.7316620241411327, + "grad_norm": 1.4299613237380981, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8926914930343628, + "num_tokens": 339989323.0, + "step": 9325 + }, + { + "epoch": 1.7318477251624884, + "grad_norm": 1.514243483543396, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8757513761520386, + "num_tokens": 340028137.0, + "step": 9326 + }, + { + "epoch": 1.7320334261838441, + "grad_norm": 1.4877257347106934, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8853073120117188, + "num_tokens": 340067758.0, + "step": 9327 + }, + { + "epoch": 1.7322191272051997, + "grad_norm": 1.6376895904541016, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8736791610717773, + "num_tokens": 340101909.0, + "step": 9328 + }, + { + "epoch": 1.7324048282265552, + "grad_norm": 1.5726946592330933, + "learning_rate": 1e-06, + "loss": 0.275, + "mean_token_accuracy": 0.8983584046363831, + "num_tokens": 340131233.0, + "step": 9329 + }, + { + "epoch": 1.732590529247911, + "grad_norm": 1.611629605293274, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8814499378204346, + "num_tokens": 340165003.0, + "step": 9330 + }, + { + "epoch": 1.7327762302692666, + "grad_norm": 1.44145667552948, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8855351209640503, + "num_tokens": 340208796.0, + "step": 9331 + }, + { + "epoch": 1.7329619312906221, + "grad_norm": 1.6769481897354126, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8866838812828064, + "num_tokens": 340246387.0, + "step": 9332 + }, + { + "epoch": 1.7331476323119777, + "grad_norm": 1.7080612182617188, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8772459030151367, + "num_tokens": 340277542.0, + "step": 9333 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 1.5224733352661133, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8714499473571777, + "num_tokens": 340317946.0, + "step": 9334 + }, + { + "epoch": 1.733519034354689, + "grad_norm": 1.5050158500671387, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8724541068077087, + "num_tokens": 340353903.0, + "step": 9335 + }, + { + "epoch": 1.7337047353760444, + "grad_norm": 1.5205771923065186, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8747977614402771, + "num_tokens": 340398518.0, + "step": 9336 + }, + { + "epoch": 1.7338904363974001, + "grad_norm": 1.5342087745666504, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8852689266204834, + "num_tokens": 340435672.0, + "step": 9337 + }, + { + "epoch": 1.7340761374187559, + "grad_norm": 1.4824141263961792, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8781678080558777, + "num_tokens": 340476829.0, + "step": 9338 + }, + { + "epoch": 1.7342618384401114, + "grad_norm": 1.5037058591842651, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.872022807598114, + "num_tokens": 340515121.0, + "step": 9339 + }, + { + "epoch": 1.734447539461467, + "grad_norm": 1.6717833280563354, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8834255933761597, + "num_tokens": 340546975.0, + "step": 9340 + }, + { + "epoch": 1.7346332404828226, + "grad_norm": 1.4476865530014038, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8843972682952881, + "num_tokens": 340584599.0, + "step": 9341 + }, + { + "epoch": 1.7348189415041784, + "grad_norm": 1.4938533306121826, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8675328493118286, + "num_tokens": 340624480.0, + "step": 9342 + }, + { + "epoch": 1.7350046425255339, + "grad_norm": 1.4788888692855835, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8805041313171387, + "num_tokens": 340662145.0, + "step": 9343 + }, + { + "epoch": 1.7351903435468894, + "grad_norm": 1.526808500289917, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8729104399681091, + "num_tokens": 340699482.0, + "step": 9344 + }, + { + "epoch": 1.7353760445682451, + "grad_norm": 1.492699146270752, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8863164782524109, + "num_tokens": 340733613.0, + "step": 9345 + }, + { + "epoch": 1.7355617455896009, + "grad_norm": 1.6693071126937866, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8675199151039124, + "num_tokens": 340767333.0, + "step": 9346 + }, + { + "epoch": 1.7357474466109564, + "grad_norm": 1.5325478315353394, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8758366107940674, + "num_tokens": 340801996.0, + "step": 9347 + }, + { + "epoch": 1.7359331476323119, + "grad_norm": 1.5695418119430542, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.873481273651123, + "num_tokens": 340837827.0, + "step": 9348 + }, + { + "epoch": 1.7361188486536676, + "grad_norm": 1.5256495475769043, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8712368607521057, + "num_tokens": 340876819.0, + "step": 9349 + }, + { + "epoch": 1.7363045496750233, + "grad_norm": 1.4148633480072021, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8839855790138245, + "num_tokens": 340917485.0, + "step": 9350 + }, + { + "epoch": 1.7364902506963789, + "grad_norm": 1.5177518129348755, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8889009952545166, + "num_tokens": 340952771.0, + "step": 9351 + }, + { + "epoch": 1.7366759517177344, + "grad_norm": 1.4904425144195557, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.897120475769043, + "num_tokens": 340989438.0, + "step": 9352 + }, + { + "epoch": 1.73686165273909, + "grad_norm": 1.4406901597976685, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8665851354598999, + "num_tokens": 341029126.0, + "step": 9353 + }, + { + "epoch": 1.7370473537604458, + "grad_norm": 1.5835648775100708, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8813585638999939, + "num_tokens": 341060479.0, + "step": 9354 + }, + { + "epoch": 1.7372330547818013, + "grad_norm": 1.5510667562484741, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8715609312057495, + "num_tokens": 341097490.0, + "step": 9355 + }, + { + "epoch": 1.7374187558031569, + "grad_norm": 1.5386239290237427, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.870829164981842, + "num_tokens": 341132122.0, + "step": 9356 + }, + { + "epoch": 1.7376044568245126, + "grad_norm": 1.5021531581878662, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8705284595489502, + "num_tokens": 341172094.0, + "step": 9357 + }, + { + "epoch": 1.737790157845868, + "grad_norm": 1.5903364419937134, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.875251293182373, + "num_tokens": 341205994.0, + "step": 9358 + }, + { + "epoch": 1.7379758588672236, + "grad_norm": 1.5304471254348755, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8808766603469849, + "num_tokens": 341240207.0, + "step": 9359 + }, + { + "epoch": 1.7381615598885793, + "grad_norm": 1.5300041437149048, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8693022131919861, + "num_tokens": 341277584.0, + "step": 9360 + }, + { + "epoch": 1.738347260909935, + "grad_norm": 1.542301058769226, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8787667155265808, + "num_tokens": 341312746.0, + "step": 9361 + }, + { + "epoch": 1.7385329619312906, + "grad_norm": 1.689898133277893, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.869349479675293, + "num_tokens": 341346114.0, + "step": 9362 + }, + { + "epoch": 1.738718662952646, + "grad_norm": 1.4790043830871582, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8655626177787781, + "num_tokens": 341386310.0, + "step": 9363 + }, + { + "epoch": 1.7389043639740018, + "grad_norm": 1.508548378944397, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8703541159629822, + "num_tokens": 341425494.0, + "step": 9364 + }, + { + "epoch": 1.7390900649953576, + "grad_norm": 1.4902904033660889, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8823277950286865, + "num_tokens": 341461899.0, + "step": 9365 + }, + { + "epoch": 1.739275766016713, + "grad_norm": 1.5251872539520264, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8911550045013428, + "num_tokens": 341495746.0, + "step": 9366 + }, + { + "epoch": 1.7394614670380686, + "grad_norm": 1.547774314880371, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8714125752449036, + "num_tokens": 341532416.0, + "step": 9367 + }, + { + "epoch": 1.7396471680594243, + "grad_norm": 1.5776729583740234, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8695036172866821, + "num_tokens": 341569485.0, + "step": 9368 + }, + { + "epoch": 1.73983286908078, + "grad_norm": 1.6912935972213745, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8741313219070435, + "num_tokens": 341601315.0, + "step": 9369 + }, + { + "epoch": 1.7400185701021356, + "grad_norm": 1.6012297868728638, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.888579249382019, + "num_tokens": 341633699.0, + "step": 9370 + }, + { + "epoch": 1.740204271123491, + "grad_norm": 1.4367727041244507, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8936094045639038, + "num_tokens": 341669285.0, + "step": 9371 + }, + { + "epoch": 1.7403899721448468, + "grad_norm": 1.4300367832183838, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8766958713531494, + "num_tokens": 341711135.0, + "step": 9372 + }, + { + "epoch": 1.7405756731662025, + "grad_norm": 1.4841530323028564, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8808937072753906, + "num_tokens": 341749000.0, + "step": 9373 + }, + { + "epoch": 1.740761374187558, + "grad_norm": 1.5152217149734497, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8806545734405518, + "num_tokens": 341784404.0, + "step": 9374 + }, + { + "epoch": 1.7409470752089136, + "grad_norm": 1.380568265914917, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8848150968551636, + "num_tokens": 341826382.0, + "step": 9375 + }, + { + "epoch": 1.7411327762302693, + "grad_norm": 1.6334086656570435, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8680042028427124, + "num_tokens": 341866181.0, + "step": 9376 + }, + { + "epoch": 1.741318477251625, + "grad_norm": 1.517330527305603, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8808497190475464, + "num_tokens": 341905359.0, + "step": 9377 + }, + { + "epoch": 1.7415041782729805, + "grad_norm": 1.5635640621185303, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8711525201797485, + "num_tokens": 341942773.0, + "step": 9378 + }, + { + "epoch": 1.741689879294336, + "grad_norm": 1.4050164222717285, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.877228856086731, + "num_tokens": 341985892.0, + "step": 9379 + }, + { + "epoch": 1.7418755803156918, + "grad_norm": 1.501920461654663, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8826070427894592, + "num_tokens": 342022003.0, + "step": 9380 + }, + { + "epoch": 1.7420612813370475, + "grad_norm": 1.7165037393569946, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8764247298240662, + "num_tokens": 342052178.0, + "step": 9381 + }, + { + "epoch": 1.7422469823584028, + "grad_norm": 1.5542354583740234, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.891119658946991, + "num_tokens": 342089068.0, + "step": 9382 + }, + { + "epoch": 1.7424326833797585, + "grad_norm": 1.6358271837234497, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8608090877532959, + "num_tokens": 342125399.0, + "step": 9383 + }, + { + "epoch": 1.7426183844011143, + "grad_norm": 1.820008397102356, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8637274503707886, + "num_tokens": 342154269.0, + "step": 9384 + }, + { + "epoch": 1.7428040854224698, + "grad_norm": 1.4187986850738525, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.885877251625061, + "num_tokens": 342193100.0, + "step": 9385 + }, + { + "epoch": 1.7429897864438253, + "grad_norm": 1.5645993947982788, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8667495846748352, + "num_tokens": 342233440.0, + "step": 9386 + }, + { + "epoch": 1.743175487465181, + "grad_norm": 1.4631617069244385, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8879652619361877, + "num_tokens": 342274113.0, + "step": 9387 + }, + { + "epoch": 1.7433611884865368, + "grad_norm": 1.4856053590774536, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8880467414855957, + "num_tokens": 342308826.0, + "step": 9388 + }, + { + "epoch": 1.7435468895078923, + "grad_norm": 1.489727258682251, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8900820016860962, + "num_tokens": 342345789.0, + "step": 9389 + }, + { + "epoch": 1.7437325905292478, + "grad_norm": 1.7082247734069824, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8809676766395569, + "num_tokens": 342375720.0, + "step": 9390 + }, + { + "epoch": 1.7439182915506035, + "grad_norm": 1.4826196432113647, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8757553100585938, + "num_tokens": 342415750.0, + "step": 9391 + }, + { + "epoch": 1.7441039925719592, + "grad_norm": 1.5571473836898804, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8749150037765503, + "num_tokens": 342452398.0, + "step": 9392 + }, + { + "epoch": 1.7442896935933148, + "grad_norm": 1.714577317237854, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8718585968017578, + "num_tokens": 342483862.0, + "step": 9393 + }, + { + "epoch": 1.7444753946146703, + "grad_norm": 1.5893032550811768, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8661208152770996, + "num_tokens": 342520846.0, + "step": 9394 + }, + { + "epoch": 1.744661095636026, + "grad_norm": 1.5913127660751343, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8747376203536987, + "num_tokens": 342557677.0, + "step": 9395 + }, + { + "epoch": 1.7448467966573817, + "grad_norm": 1.510067105293274, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8793282508850098, + "num_tokens": 342597023.0, + "step": 9396 + }, + { + "epoch": 1.7450324976787372, + "grad_norm": 1.4098401069641113, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8932192325592041, + "num_tokens": 342633365.0, + "step": 9397 + }, + { + "epoch": 1.7452181987000928, + "grad_norm": 1.4991992712020874, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8833367228507996, + "num_tokens": 342670417.0, + "step": 9398 + }, + { + "epoch": 1.7454038997214485, + "grad_norm": 1.6125670671463013, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8783774971961975, + "num_tokens": 342707675.0, + "step": 9399 + }, + { + "epoch": 1.7455896007428042, + "grad_norm": 1.5874308347702026, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8720421195030212, + "num_tokens": 342742508.0, + "step": 9400 + }, + { + "epoch": 1.7457753017641597, + "grad_norm": 1.549901008605957, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8712195754051208, + "num_tokens": 342781284.0, + "step": 9401 + }, + { + "epoch": 1.7459610027855152, + "grad_norm": 1.6387407779693604, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8908350467681885, + "num_tokens": 342813226.0, + "step": 9402 + }, + { + "epoch": 1.746146703806871, + "grad_norm": 1.585891842842102, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8784716129302979, + "num_tokens": 342845928.0, + "step": 9403 + }, + { + "epoch": 1.7463324048282267, + "grad_norm": 1.524147391319275, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8716965913772583, + "num_tokens": 342885185.0, + "step": 9404 + }, + { + "epoch": 1.7465181058495822, + "grad_norm": 1.518847107887268, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8869820833206177, + "num_tokens": 342921579.0, + "step": 9405 + }, + { + "epoch": 1.7467038068709377, + "grad_norm": 1.57274329662323, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8767448663711548, + "num_tokens": 342957091.0, + "step": 9406 + }, + { + "epoch": 1.7468895078922935, + "grad_norm": 1.4411509037017822, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.886072039604187, + "num_tokens": 342994882.0, + "step": 9407 + }, + { + "epoch": 1.747075208913649, + "grad_norm": 1.5501981973648071, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8762948513031006, + "num_tokens": 343032744.0, + "step": 9408 + }, + { + "epoch": 1.7472609099350045, + "grad_norm": 1.431391954421997, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8682719469070435, + "num_tokens": 343077162.0, + "step": 9409 + }, + { + "epoch": 1.7474466109563602, + "grad_norm": 1.441118836402893, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8849111795425415, + "num_tokens": 343119687.0, + "step": 9410 + }, + { + "epoch": 1.747632311977716, + "grad_norm": 1.478080153465271, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8809549808502197, + "num_tokens": 343157108.0, + "step": 9411 + }, + { + "epoch": 1.7478180129990715, + "grad_norm": 1.4590214490890503, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8769720792770386, + "num_tokens": 343197777.0, + "step": 9412 + }, + { + "epoch": 1.748003714020427, + "grad_norm": 1.5417864322662354, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8818647861480713, + "num_tokens": 343231624.0, + "step": 9413 + }, + { + "epoch": 1.7481894150417827, + "grad_norm": 1.5039314031600952, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8891878128051758, + "num_tokens": 343266529.0, + "step": 9414 + }, + { + "epoch": 1.7483751160631384, + "grad_norm": 1.5380326509475708, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8754675388336182, + "num_tokens": 343306228.0, + "step": 9415 + }, + { + "epoch": 1.748560817084494, + "grad_norm": 1.5443384647369385, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8853167295455933, + "num_tokens": 343341769.0, + "step": 9416 + }, + { + "epoch": 1.7487465181058495, + "grad_norm": 1.4465687274932861, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8814636468887329, + "num_tokens": 343381304.0, + "step": 9417 + }, + { + "epoch": 1.7489322191272052, + "grad_norm": 1.5562995672225952, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8834244608879089, + "num_tokens": 343416892.0, + "step": 9418 + }, + { + "epoch": 1.749117920148561, + "grad_norm": 1.4827828407287598, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8700603246688843, + "num_tokens": 343458415.0, + "step": 9419 + }, + { + "epoch": 1.7493036211699164, + "grad_norm": 1.4079869985580444, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8809754252433777, + "num_tokens": 343500557.0, + "step": 9420 + }, + { + "epoch": 1.749489322191272, + "grad_norm": 1.634629487991333, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.87926185131073, + "num_tokens": 343534250.0, + "step": 9421 + }, + { + "epoch": 1.7496750232126277, + "grad_norm": 1.5229980945587158, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8790236115455627, + "num_tokens": 343571448.0, + "step": 9422 + }, + { + "epoch": 1.7498607242339834, + "grad_norm": 1.5291287899017334, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8816860914230347, + "num_tokens": 343607289.0, + "step": 9423 + }, + { + "epoch": 1.750046425255339, + "grad_norm": 1.482387900352478, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8787766695022583, + "num_tokens": 343645203.0, + "step": 9424 + }, + { + "epoch": 1.7502321262766944, + "grad_norm": 1.5772340297698975, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8939765691757202, + "num_tokens": 343676718.0, + "step": 9425 + }, + { + "epoch": 1.7504178272980502, + "grad_norm": 1.5700715780258179, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8840576410293579, + "num_tokens": 343713006.0, + "step": 9426 + }, + { + "epoch": 1.750603528319406, + "grad_norm": 1.3982105255126953, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8876772522926331, + "num_tokens": 343749816.0, + "step": 9427 + }, + { + "epoch": 1.7507892293407614, + "grad_norm": 1.4904844760894775, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8547974824905396, + "num_tokens": 343792067.0, + "step": 9428 + }, + { + "epoch": 1.750974930362117, + "grad_norm": 1.5130529403686523, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8821709752082825, + "num_tokens": 343828258.0, + "step": 9429 + }, + { + "epoch": 1.7511606313834727, + "grad_norm": 1.5321471691131592, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8776273727416992, + "num_tokens": 343866019.0, + "step": 9430 + }, + { + "epoch": 1.7513463324048282, + "grad_norm": 1.7441290616989136, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8736293911933899, + "num_tokens": 343900247.0, + "step": 9431 + }, + { + "epoch": 1.7515320334261837, + "grad_norm": 1.621621012687683, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8865219950675964, + "num_tokens": 343934040.0, + "step": 9432 + }, + { + "epoch": 1.7517177344475394, + "grad_norm": 1.5426380634307861, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.870419979095459, + "num_tokens": 343972220.0, + "step": 9433 + }, + { + "epoch": 1.7519034354688952, + "grad_norm": 1.4358304738998413, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8913593292236328, + "num_tokens": 344011398.0, + "step": 9434 + }, + { + "epoch": 1.7520891364902507, + "grad_norm": 1.4730165004730225, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8864213824272156, + "num_tokens": 344049401.0, + "step": 9435 + }, + { + "epoch": 1.7522748375116062, + "grad_norm": 1.6144016981124878, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8647606372833252, + "num_tokens": 344086535.0, + "step": 9436 + }, + { + "epoch": 1.752460538532962, + "grad_norm": 1.4639201164245605, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.883242666721344, + "num_tokens": 344123525.0, + "step": 9437 + }, + { + "epoch": 1.7526462395543176, + "grad_norm": 1.5165661573410034, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8763529062271118, + "num_tokens": 344162622.0, + "step": 9438 + }, + { + "epoch": 1.7528319405756732, + "grad_norm": 1.6762847900390625, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.871940016746521, + "num_tokens": 344197194.0, + "step": 9439 + }, + { + "epoch": 1.7530176415970287, + "grad_norm": 1.5743060111999512, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8797763586044312, + "num_tokens": 344234272.0, + "step": 9440 + }, + { + "epoch": 1.7532033426183844, + "grad_norm": 1.5555697679519653, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8835123181343079, + "num_tokens": 344268094.0, + "step": 9441 + }, + { + "epoch": 1.7533890436397401, + "grad_norm": 1.4850105047225952, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8932226896286011, + "num_tokens": 344304674.0, + "step": 9442 + }, + { + "epoch": 1.7535747446610956, + "grad_norm": 1.4769489765167236, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8718041181564331, + "num_tokens": 344344007.0, + "step": 9443 + }, + { + "epoch": 1.7537604456824512, + "grad_norm": 1.5121300220489502, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8805525302886963, + "num_tokens": 344381626.0, + "step": 9444 + }, + { + "epoch": 1.7539461467038069, + "grad_norm": 1.5879945755004883, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8884526491165161, + "num_tokens": 344414428.0, + "step": 9445 + }, + { + "epoch": 1.7541318477251626, + "grad_norm": 1.4577877521514893, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8896379470825195, + "num_tokens": 344450846.0, + "step": 9446 + }, + { + "epoch": 1.7543175487465181, + "grad_norm": 1.6283395290374756, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8893214464187622, + "num_tokens": 344483496.0, + "step": 9447 + }, + { + "epoch": 1.7545032497678736, + "grad_norm": 1.4671566486358643, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8752597570419312, + "num_tokens": 344520418.0, + "step": 9448 + }, + { + "epoch": 1.7546889507892294, + "grad_norm": 1.5081515312194824, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8817583918571472, + "num_tokens": 344556570.0, + "step": 9449 + }, + { + "epoch": 1.754874651810585, + "grad_norm": 1.5999137163162231, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8719593286514282, + "num_tokens": 344593243.0, + "step": 9450 + }, + { + "epoch": 1.7550603528319406, + "grad_norm": 1.520919680595398, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8922085762023926, + "num_tokens": 344630209.0, + "step": 9451 + }, + { + "epoch": 1.7552460538532961, + "grad_norm": 1.5244325399398804, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8880078196525574, + "num_tokens": 344666050.0, + "step": 9452 + }, + { + "epoch": 1.7554317548746519, + "grad_norm": 1.3956378698349, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8822728395462036, + "num_tokens": 344705889.0, + "step": 9453 + }, + { + "epoch": 1.7556174558960074, + "grad_norm": 1.506455421447754, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.879489004611969, + "num_tokens": 344738215.0, + "step": 9454 + }, + { + "epoch": 1.7558031569173629, + "grad_norm": 1.598839521408081, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.887230634689331, + "num_tokens": 344770358.0, + "step": 9455 + }, + { + "epoch": 1.7559888579387186, + "grad_norm": 1.3382288217544556, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.895626425743103, + "num_tokens": 344809911.0, + "step": 9456 + }, + { + "epoch": 1.7561745589600744, + "grad_norm": 1.4577715396881104, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8766931295394897, + "num_tokens": 344849229.0, + "step": 9457 + }, + { + "epoch": 1.7563602599814299, + "grad_norm": 1.5893561840057373, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8851082921028137, + "num_tokens": 344884140.0, + "step": 9458 + }, + { + "epoch": 1.7565459610027854, + "grad_norm": 1.5414073467254639, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8862386345863342, + "num_tokens": 344918700.0, + "step": 9459 + }, + { + "epoch": 1.756731662024141, + "grad_norm": 1.506701946258545, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8729719519615173, + "num_tokens": 344958025.0, + "step": 9460 + }, + { + "epoch": 1.7569173630454968, + "grad_norm": 1.4344120025634766, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8729349374771118, + "num_tokens": 344999319.0, + "step": 9461 + }, + { + "epoch": 1.7571030640668523, + "grad_norm": 1.6502773761749268, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8832062482833862, + "num_tokens": 345031759.0, + "step": 9462 + }, + { + "epoch": 1.7572887650882079, + "grad_norm": 1.562718391418457, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8780124187469482, + "num_tokens": 345066825.0, + "step": 9463 + }, + { + "epoch": 1.7574744661095636, + "grad_norm": 1.6627308130264282, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8645172715187073, + "num_tokens": 345102311.0, + "step": 9464 + }, + { + "epoch": 1.7576601671309193, + "grad_norm": 1.5390920639038086, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8811026811599731, + "num_tokens": 345136435.0, + "step": 9465 + }, + { + "epoch": 1.7578458681522748, + "grad_norm": 1.614297866821289, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8721734285354614, + "num_tokens": 345171558.0, + "step": 9466 + }, + { + "epoch": 1.7580315691736303, + "grad_norm": 1.6272081136703491, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8726006150245667, + "num_tokens": 345208094.0, + "step": 9467 + }, + { + "epoch": 1.758217270194986, + "grad_norm": 1.5953530073165894, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8834227323532104, + "num_tokens": 345246758.0, + "step": 9468 + }, + { + "epoch": 1.7584029712163418, + "grad_norm": 1.3484667539596558, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8905085921287537, + "num_tokens": 345288828.0, + "step": 9469 + }, + { + "epoch": 1.7585886722376973, + "grad_norm": 1.7616575956344604, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8732033967971802, + "num_tokens": 345318347.0, + "step": 9470 + }, + { + "epoch": 1.7587743732590528, + "grad_norm": 1.5457587242126465, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8848743438720703, + "num_tokens": 345355822.0, + "step": 9471 + }, + { + "epoch": 1.7589600742804086, + "grad_norm": 1.5572770833969116, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8855443596839905, + "num_tokens": 345391721.0, + "step": 9472 + }, + { + "epoch": 1.7591457753017643, + "grad_norm": 1.504822015762329, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8780661821365356, + "num_tokens": 345430041.0, + "step": 9473 + }, + { + "epoch": 1.7593314763231198, + "grad_norm": 1.4563533067703247, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8765174150466919, + "num_tokens": 345469450.0, + "step": 9474 + }, + { + "epoch": 1.7595171773444753, + "grad_norm": 1.484981894493103, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8934468030929565, + "num_tokens": 345502809.0, + "step": 9475 + }, + { + "epoch": 1.759702878365831, + "grad_norm": 1.440079927444458, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8661066293716431, + "num_tokens": 345545057.0, + "step": 9476 + }, + { + "epoch": 1.7598885793871868, + "grad_norm": 1.694723129272461, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8840949535369873, + "num_tokens": 345574604.0, + "step": 9477 + }, + { + "epoch": 1.7600742804085423, + "grad_norm": 1.4520015716552734, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8730531930923462, + "num_tokens": 345617364.0, + "step": 9478 + }, + { + "epoch": 1.7602599814298978, + "grad_norm": 1.5808454751968384, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8767541646957397, + "num_tokens": 345650711.0, + "step": 9479 + }, + { + "epoch": 1.7604456824512535, + "grad_norm": 1.5447245836257935, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.868570864200592, + "num_tokens": 345690802.0, + "step": 9480 + }, + { + "epoch": 1.760631383472609, + "grad_norm": 1.5376290082931519, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8857376575469971, + "num_tokens": 345726488.0, + "step": 9481 + }, + { + "epoch": 1.7608170844939646, + "grad_norm": 1.5134754180908203, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8768303394317627, + "num_tokens": 345765963.0, + "step": 9482 + }, + { + "epoch": 1.7610027855153203, + "grad_norm": 1.4626357555389404, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8822473287582397, + "num_tokens": 345807610.0, + "step": 9483 + }, + { + "epoch": 1.761188486536676, + "grad_norm": 1.6738802194595337, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8758290410041809, + "num_tokens": 345845758.0, + "step": 9484 + }, + { + "epoch": 1.7613741875580315, + "grad_norm": 1.4819355010986328, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8732280731201172, + "num_tokens": 345886585.0, + "step": 9485 + }, + { + "epoch": 1.761559888579387, + "grad_norm": 1.4765801429748535, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8850854635238647, + "num_tokens": 345924016.0, + "step": 9486 + }, + { + "epoch": 1.7617455896007428, + "grad_norm": 1.6214778423309326, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8909885883331299, + "num_tokens": 345956524.0, + "step": 9487 + }, + { + "epoch": 1.7619312906220985, + "grad_norm": 1.5189404487609863, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8739287257194519, + "num_tokens": 345994256.0, + "step": 9488 + }, + { + "epoch": 1.762116991643454, + "grad_norm": 1.5079065561294556, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8832978010177612, + "num_tokens": 346029901.0, + "step": 9489 + }, + { + "epoch": 1.7623026926648095, + "grad_norm": 1.4569408893585205, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.883448600769043, + "num_tokens": 346067052.0, + "step": 9490 + }, + { + "epoch": 1.7624883936861653, + "grad_norm": 1.6444014310836792, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8737971186637878, + "num_tokens": 346099922.0, + "step": 9491 + }, + { + "epoch": 1.762674094707521, + "grad_norm": 1.5611391067504883, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8808550238609314, + "num_tokens": 346134272.0, + "step": 9492 + }, + { + "epoch": 1.7628597957288765, + "grad_norm": 1.4959940910339355, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8800171613693237, + "num_tokens": 346170922.0, + "step": 9493 + }, + { + "epoch": 1.763045496750232, + "grad_norm": 1.6353546380996704, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8716637492179871, + "num_tokens": 346207967.0, + "step": 9494 + }, + { + "epoch": 1.7632311977715878, + "grad_norm": 1.6629743576049805, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.868699312210083, + "num_tokens": 346243951.0, + "step": 9495 + }, + { + "epoch": 1.7634168987929435, + "grad_norm": 1.4946513175964355, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8627446889877319, + "num_tokens": 346285847.0, + "step": 9496 + }, + { + "epoch": 1.763602599814299, + "grad_norm": 1.5017644166946411, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8675879240036011, + "num_tokens": 346327993.0, + "step": 9497 + }, + { + "epoch": 1.7637883008356545, + "grad_norm": 1.5834916830062866, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8735264539718628, + "num_tokens": 346361972.0, + "step": 9498 + }, + { + "epoch": 1.7639740018570103, + "grad_norm": 1.59297513961792, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8424139022827148, + "num_tokens": 346399833.0, + "step": 9499 + }, + { + "epoch": 1.764159702878366, + "grad_norm": 1.5211890935897827, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8787007331848145, + "num_tokens": 346436408.0, + "step": 9500 + }, + { + "epoch": 1.7643454038997215, + "grad_norm": 1.585843563079834, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8690844774246216, + "num_tokens": 346471779.0, + "step": 9501 + }, + { + "epoch": 1.764531104921077, + "grad_norm": 1.4754947423934937, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8728330731391907, + "num_tokens": 346512521.0, + "step": 9502 + }, + { + "epoch": 1.7647168059424327, + "grad_norm": 1.491733193397522, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8756924271583557, + "num_tokens": 346549906.0, + "step": 9503 + }, + { + "epoch": 1.7649025069637883, + "grad_norm": 1.4640966653823853, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8794119954109192, + "num_tokens": 346588147.0, + "step": 9504 + }, + { + "epoch": 1.7650882079851438, + "grad_norm": 1.3994505405426025, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8840972185134888, + "num_tokens": 346628835.0, + "step": 9505 + }, + { + "epoch": 1.7652739090064995, + "grad_norm": 1.4395644664764404, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8770925998687744, + "num_tokens": 346667013.0, + "step": 9506 + }, + { + "epoch": 1.7654596100278552, + "grad_norm": 1.4945967197418213, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8832969069480896, + "num_tokens": 346704570.0, + "step": 9507 + }, + { + "epoch": 1.7656453110492107, + "grad_norm": 1.6171120405197144, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8900719881057739, + "num_tokens": 346735695.0, + "step": 9508 + }, + { + "epoch": 1.7658310120705663, + "grad_norm": 1.6621685028076172, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8862019777297974, + "num_tokens": 346768644.0, + "step": 9509 + }, + { + "epoch": 1.766016713091922, + "grad_norm": 1.6116265058517456, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.88542640209198, + "num_tokens": 346802024.0, + "step": 9510 + }, + { + "epoch": 1.7662024141132777, + "grad_norm": 1.4208890199661255, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8844645023345947, + "num_tokens": 346842731.0, + "step": 9511 + }, + { + "epoch": 1.7663881151346332, + "grad_norm": 1.465790867805481, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.879537045955658, + "num_tokens": 346885339.0, + "step": 9512 + }, + { + "epoch": 1.7665738161559887, + "grad_norm": 1.4011430740356445, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8885998725891113, + "num_tokens": 346923601.0, + "step": 9513 + }, + { + "epoch": 1.7667595171773445, + "grad_norm": 1.5778326988220215, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8780681490898132, + "num_tokens": 346957872.0, + "step": 9514 + }, + { + "epoch": 1.7669452181987002, + "grad_norm": 1.435280442237854, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8822750449180603, + "num_tokens": 347000269.0, + "step": 9515 + }, + { + "epoch": 1.7671309192200557, + "grad_norm": 1.5909042358398438, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8843592405319214, + "num_tokens": 347032744.0, + "step": 9516 + }, + { + "epoch": 1.7673166202414112, + "grad_norm": 1.5691397190093994, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.874609649181366, + "num_tokens": 347072369.0, + "step": 9517 + }, + { + "epoch": 1.767502321262767, + "grad_norm": 1.5614352226257324, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8688692450523376, + "num_tokens": 347113638.0, + "step": 9518 + }, + { + "epoch": 1.7676880222841227, + "grad_norm": 1.6296632289886475, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8832343816757202, + "num_tokens": 347147966.0, + "step": 9519 + }, + { + "epoch": 1.7678737233054782, + "grad_norm": 1.4652442932128906, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8829139471054077, + "num_tokens": 347187087.0, + "step": 9520 + }, + { + "epoch": 1.7680594243268337, + "grad_norm": 1.4997390508651733, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8949241042137146, + "num_tokens": 347222309.0, + "step": 9521 + }, + { + "epoch": 1.7682451253481895, + "grad_norm": 1.6905543804168701, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8786805868148804, + "num_tokens": 347253877.0, + "step": 9522 + }, + { + "epoch": 1.7684308263695452, + "grad_norm": 1.396070122718811, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8905789852142334, + "num_tokens": 347294921.0, + "step": 9523 + }, + { + "epoch": 1.7686165273909007, + "grad_norm": 1.4471855163574219, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8884522914886475, + "num_tokens": 347330881.0, + "step": 9524 + }, + { + "epoch": 1.7688022284122562, + "grad_norm": 1.3576102256774902, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8832696080207825, + "num_tokens": 347376643.0, + "step": 9525 + }, + { + "epoch": 1.768987929433612, + "grad_norm": 1.5672292709350586, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8722184896469116, + "num_tokens": 347411323.0, + "step": 9526 + }, + { + "epoch": 1.7691736304549674, + "grad_norm": 1.5015323162078857, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.875064492225647, + "num_tokens": 347451363.0, + "step": 9527 + }, + { + "epoch": 1.769359331476323, + "grad_norm": 1.8366237878799438, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8888375163078308, + "num_tokens": 347481138.0, + "step": 9528 + }, + { + "epoch": 1.7695450324976787, + "grad_norm": 1.5747621059417725, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8722984194755554, + "num_tokens": 347517085.0, + "step": 9529 + }, + { + "epoch": 1.7697307335190344, + "grad_norm": 1.6271617412567139, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8750530481338501, + "num_tokens": 347550297.0, + "step": 9530 + }, + { + "epoch": 1.76991643454039, + "grad_norm": 1.5696136951446533, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8798900842666626, + "num_tokens": 347585465.0, + "step": 9531 + }, + { + "epoch": 1.7701021355617454, + "grad_norm": 1.489476203918457, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8601089715957642, + "num_tokens": 347623425.0, + "step": 9532 + }, + { + "epoch": 1.7702878365831012, + "grad_norm": 1.6068296432495117, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8916866779327393, + "num_tokens": 347661261.0, + "step": 9533 + }, + { + "epoch": 1.770473537604457, + "grad_norm": 1.653645634651184, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8765732049942017, + "num_tokens": 347698518.0, + "step": 9534 + }, + { + "epoch": 1.7706592386258124, + "grad_norm": 1.7768418788909912, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8694600462913513, + "num_tokens": 347731606.0, + "step": 9535 + }, + { + "epoch": 1.770844939647168, + "grad_norm": 1.4888347387313843, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8954386115074158, + "num_tokens": 347767065.0, + "step": 9536 + }, + { + "epoch": 1.7710306406685237, + "grad_norm": 1.6857719421386719, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.872563898563385, + "num_tokens": 347800056.0, + "step": 9537 + }, + { + "epoch": 1.7712163416898794, + "grad_norm": 1.771323323249817, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8749054670333862, + "num_tokens": 347833273.0, + "step": 9538 + }, + { + "epoch": 1.771402042711235, + "grad_norm": 1.6328896284103394, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8775198459625244, + "num_tokens": 347866815.0, + "step": 9539 + }, + { + "epoch": 1.7715877437325904, + "grad_norm": 1.4245851039886475, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.874398946762085, + "num_tokens": 347906901.0, + "step": 9540 + }, + { + "epoch": 1.7717734447539462, + "grad_norm": 1.6009371280670166, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8748696446418762, + "num_tokens": 347942246.0, + "step": 9541 + }, + { + "epoch": 1.771959145775302, + "grad_norm": 1.4574185609817505, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.888282835483551, + "num_tokens": 347980941.0, + "step": 9542 + }, + { + "epoch": 1.7721448467966574, + "grad_norm": 1.5518414974212646, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8791012763977051, + "num_tokens": 348012207.0, + "step": 9543 + }, + { + "epoch": 1.772330547818013, + "grad_norm": 1.5689387321472168, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8696780204772949, + "num_tokens": 348050176.0, + "step": 9544 + }, + { + "epoch": 1.7725162488393686, + "grad_norm": 1.4943596124649048, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8877016305923462, + "num_tokens": 348087796.0, + "step": 9545 + }, + { + "epoch": 1.7727019498607244, + "grad_norm": 1.5219333171844482, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8793184757232666, + "num_tokens": 348123028.0, + "step": 9546 + }, + { + "epoch": 1.77288765088208, + "grad_norm": 1.554938793182373, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8642467856407166, + "num_tokens": 348165220.0, + "step": 9547 + }, + { + "epoch": 1.7730733519034354, + "grad_norm": 1.6933095455169678, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8843332529067993, + "num_tokens": 348194877.0, + "step": 9548 + }, + { + "epoch": 1.7732590529247911, + "grad_norm": 1.5181795358657837, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8732532262802124, + "num_tokens": 348229703.0, + "step": 9549 + }, + { + "epoch": 1.7734447539461469, + "grad_norm": 1.4930250644683838, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8744590878486633, + "num_tokens": 348271780.0, + "step": 9550 + }, + { + "epoch": 1.7736304549675022, + "grad_norm": 1.626120686531067, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8742585182189941, + "num_tokens": 348304749.0, + "step": 9551 + }, + { + "epoch": 1.773816155988858, + "grad_norm": 1.6293575763702393, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8848525285720825, + "num_tokens": 348335327.0, + "step": 9552 + }, + { + "epoch": 1.7740018570102136, + "grad_norm": 1.5217469930648804, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.873024582862854, + "num_tokens": 348372080.0, + "step": 9553 + }, + { + "epoch": 1.7741875580315691, + "grad_norm": 1.5391985177993774, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8899335265159607, + "num_tokens": 348404044.0, + "step": 9554 + }, + { + "epoch": 1.7743732590529246, + "grad_norm": 1.451482892036438, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8919090032577515, + "num_tokens": 348439288.0, + "step": 9555 + }, + { + "epoch": 1.7745589600742804, + "grad_norm": 1.5853434801101685, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8697388172149658, + "num_tokens": 348477414.0, + "step": 9556 + }, + { + "epoch": 1.7747446610956361, + "grad_norm": 1.4874029159545898, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8866027593612671, + "num_tokens": 348514020.0, + "step": 9557 + }, + { + "epoch": 1.7749303621169916, + "grad_norm": 1.4687446355819702, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8928079605102539, + "num_tokens": 348548279.0, + "step": 9558 + }, + { + "epoch": 1.7751160631383471, + "grad_norm": 1.5526199340820312, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8838709592819214, + "num_tokens": 348583850.0, + "step": 9559 + }, + { + "epoch": 1.7753017641597029, + "grad_norm": 1.6069940328598022, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8556522130966187, + "num_tokens": 348620566.0, + "step": 9560 + }, + { + "epoch": 1.7754874651810586, + "grad_norm": 1.5215498208999634, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8666289448738098, + "num_tokens": 348662121.0, + "step": 9561 + }, + { + "epoch": 1.775673166202414, + "grad_norm": 1.511864185333252, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8774492740631104, + "num_tokens": 348700189.0, + "step": 9562 + }, + { + "epoch": 1.7758588672237696, + "grad_norm": 1.502879023551941, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8760210275650024, + "num_tokens": 348736345.0, + "step": 9563 + }, + { + "epoch": 1.7760445682451254, + "grad_norm": 1.5517934560775757, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8681921362876892, + "num_tokens": 348775065.0, + "step": 9564 + }, + { + "epoch": 1.776230269266481, + "grad_norm": 1.5650854110717773, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8545317053794861, + "num_tokens": 348817210.0, + "step": 9565 + }, + { + "epoch": 1.7764159702878366, + "grad_norm": 1.6045266389846802, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8753123879432678, + "num_tokens": 348850443.0, + "step": 9566 + }, + { + "epoch": 1.776601671309192, + "grad_norm": 1.5706571340560913, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8901854753494263, + "num_tokens": 348882965.0, + "step": 9567 + }, + { + "epoch": 1.7767873723305478, + "grad_norm": 1.4845727682113647, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8841195702552795, + "num_tokens": 348919151.0, + "step": 9568 + }, + { + "epoch": 1.7769730733519036, + "grad_norm": 1.4896469116210938, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8928808569908142, + "num_tokens": 348952313.0, + "step": 9569 + }, + { + "epoch": 1.777158774373259, + "grad_norm": 1.3795777559280396, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8729345798492432, + "num_tokens": 348995480.0, + "step": 9570 + }, + { + "epoch": 1.7773444753946146, + "grad_norm": 1.6625968217849731, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8817341923713684, + "num_tokens": 349026229.0, + "step": 9571 + }, + { + "epoch": 1.7775301764159703, + "grad_norm": 1.473987102508545, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8838331699371338, + "num_tokens": 349062446.0, + "step": 9572 + }, + { + "epoch": 1.777715877437326, + "grad_norm": 1.4818722009658813, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.884880781173706, + "num_tokens": 349098161.0, + "step": 9573 + }, + { + "epoch": 1.7779015784586816, + "grad_norm": 1.514746904373169, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8869634866714478, + "num_tokens": 349131699.0, + "step": 9574 + }, + { + "epoch": 1.778087279480037, + "grad_norm": 1.6694869995117188, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8972598314285278, + "num_tokens": 349159500.0, + "step": 9575 + }, + { + "epoch": 1.7782729805013928, + "grad_norm": 1.8197656869888306, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8719087243080139, + "num_tokens": 349188835.0, + "step": 9576 + }, + { + "epoch": 1.7784586815227483, + "grad_norm": 1.556930422782898, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8807467222213745, + "num_tokens": 349225313.0, + "step": 9577 + }, + { + "epoch": 1.7786443825441038, + "grad_norm": 1.5310111045837402, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8821743130683899, + "num_tokens": 349258419.0, + "step": 9578 + }, + { + "epoch": 1.7788300835654596, + "grad_norm": 1.5438603162765503, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8779366612434387, + "num_tokens": 349294554.0, + "step": 9579 + }, + { + "epoch": 1.7790157845868153, + "grad_norm": 1.5498278141021729, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8644996881484985, + "num_tokens": 349335326.0, + "step": 9580 + }, + { + "epoch": 1.7792014856081708, + "grad_norm": 1.5418701171875, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8847939372062683, + "num_tokens": 349368938.0, + "step": 9581 + }, + { + "epoch": 1.7793871866295263, + "grad_norm": 1.5031464099884033, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8800652623176575, + "num_tokens": 349406500.0, + "step": 9582 + }, + { + "epoch": 1.779572887650882, + "grad_norm": 1.5589168071746826, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8639489412307739, + "num_tokens": 349443289.0, + "step": 9583 + }, + { + "epoch": 1.7797585886722378, + "grad_norm": 1.40904700756073, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8894592523574829, + "num_tokens": 349484491.0, + "step": 9584 + }, + { + "epoch": 1.7799442896935933, + "grad_norm": 1.5994526147842407, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8750617504119873, + "num_tokens": 349519070.0, + "step": 9585 + }, + { + "epoch": 1.7801299907149488, + "grad_norm": 1.5350886583328247, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.87786865234375, + "num_tokens": 349553705.0, + "step": 9586 + }, + { + "epoch": 1.7803156917363046, + "grad_norm": 1.6889545917510986, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.875722348690033, + "num_tokens": 349586864.0, + "step": 9587 + }, + { + "epoch": 1.7805013927576603, + "grad_norm": 1.5794342756271362, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8669712543487549, + "num_tokens": 349626599.0, + "step": 9588 + }, + { + "epoch": 1.7806870937790158, + "grad_norm": 1.530991792678833, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8697071075439453, + "num_tokens": 349667234.0, + "step": 9589 + }, + { + "epoch": 1.7808727948003713, + "grad_norm": 1.5535551309585571, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8888985514640808, + "num_tokens": 349703362.0, + "step": 9590 + }, + { + "epoch": 1.781058495821727, + "grad_norm": 1.6372250318527222, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8732285499572754, + "num_tokens": 349734052.0, + "step": 9591 + }, + { + "epoch": 1.7812441968430828, + "grad_norm": 1.5181384086608887, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8672102689743042, + "num_tokens": 349777497.0, + "step": 9592 + }, + { + "epoch": 1.7814298978644383, + "grad_norm": 1.5920937061309814, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8671069741249084, + "num_tokens": 349813858.0, + "step": 9593 + }, + { + "epoch": 1.7816155988857938, + "grad_norm": 1.5496304035186768, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8864634037017822, + "num_tokens": 349845999.0, + "step": 9594 + }, + { + "epoch": 1.7818012999071495, + "grad_norm": 1.5442485809326172, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8860620260238647, + "num_tokens": 349879663.0, + "step": 9595 + }, + { + "epoch": 1.7819870009285053, + "grad_norm": 1.7018418312072754, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8725634813308716, + "num_tokens": 349911821.0, + "step": 9596 + }, + { + "epoch": 1.7821727019498608, + "grad_norm": 1.5494974851608276, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8744518160820007, + "num_tokens": 349946678.0, + "step": 9597 + }, + { + "epoch": 1.7823584029712163, + "grad_norm": 1.4677058458328247, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8840857744216919, + "num_tokens": 349981082.0, + "step": 9598 + }, + { + "epoch": 1.782544103992572, + "grad_norm": 1.4252811670303345, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8816649913787842, + "num_tokens": 350021758.0, + "step": 9599 + }, + { + "epoch": 1.7827298050139275, + "grad_norm": 1.6569889783859253, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8810855746269226, + "num_tokens": 350054690.0, + "step": 9600 + }, + { + "epoch": 1.782915506035283, + "grad_norm": 1.5093659162521362, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8891341090202332, + "num_tokens": 350090237.0, + "step": 9601 + }, + { + "epoch": 1.7831012070566388, + "grad_norm": 1.6300592422485352, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8935906291007996, + "num_tokens": 350121886.0, + "step": 9602 + }, + { + "epoch": 1.7832869080779945, + "grad_norm": 1.7127184867858887, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8853692412376404, + "num_tokens": 350151402.0, + "step": 9603 + }, + { + "epoch": 1.78347260909935, + "grad_norm": 1.450716495513916, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8830764293670654, + "num_tokens": 350190211.0, + "step": 9604 + }, + { + "epoch": 1.7836583101207055, + "grad_norm": 1.6686826944351196, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8756225109100342, + "num_tokens": 350221980.0, + "step": 9605 + }, + { + "epoch": 1.7838440111420613, + "grad_norm": 1.4921417236328125, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8738273978233337, + "num_tokens": 350261531.0, + "step": 9606 + }, + { + "epoch": 1.784029712163417, + "grad_norm": 1.4712953567504883, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8800455331802368, + "num_tokens": 350302368.0, + "step": 9607 + }, + { + "epoch": 1.7842154131847725, + "grad_norm": 1.587664008140564, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8704957365989685, + "num_tokens": 350339953.0, + "step": 9608 + }, + { + "epoch": 1.784401114206128, + "grad_norm": 1.5627347230911255, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8829771280288696, + "num_tokens": 350372603.0, + "step": 9609 + }, + { + "epoch": 1.7845868152274837, + "grad_norm": 1.6119213104248047, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8827238082885742, + "num_tokens": 350408632.0, + "step": 9610 + }, + { + "epoch": 1.7847725162488395, + "grad_norm": 1.4179190397262573, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8806076645851135, + "num_tokens": 350446719.0, + "step": 9611 + }, + { + "epoch": 1.784958217270195, + "grad_norm": 1.4429765939712524, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8854368925094604, + "num_tokens": 350483989.0, + "step": 9612 + }, + { + "epoch": 1.7851439182915505, + "grad_norm": 1.636151909828186, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8728605508804321, + "num_tokens": 350514922.0, + "step": 9613 + }, + { + "epoch": 1.7853296193129062, + "grad_norm": 1.4721516370773315, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8938162326812744, + "num_tokens": 350552224.0, + "step": 9614 + }, + { + "epoch": 1.785515320334262, + "grad_norm": 1.5112732648849487, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8798021078109741, + "num_tokens": 350587817.0, + "step": 9615 + }, + { + "epoch": 1.7857010213556175, + "grad_norm": 1.4900503158569336, + "learning_rate": 1e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.9007055163383484, + "num_tokens": 350619131.0, + "step": 9616 + }, + { + "epoch": 1.785886722376973, + "grad_norm": 1.5613890886306763, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8647385835647583, + "num_tokens": 350654183.0, + "step": 9617 + }, + { + "epoch": 1.7860724233983287, + "grad_norm": 1.6877871751785278, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8880537748336792, + "num_tokens": 350683107.0, + "step": 9618 + }, + { + "epoch": 1.7862581244196845, + "grad_norm": 1.4963343143463135, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8821301460266113, + "num_tokens": 350716809.0, + "step": 9619 + }, + { + "epoch": 1.78644382544104, + "grad_norm": 1.5101817846298218, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8677393198013306, + "num_tokens": 350756715.0, + "step": 9620 + }, + { + "epoch": 1.7866295264623955, + "grad_norm": 1.5429264307022095, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8632354736328125, + "num_tokens": 350793303.0, + "step": 9621 + }, + { + "epoch": 1.7868152274837512, + "grad_norm": 1.4433434009552002, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8820314407348633, + "num_tokens": 350831517.0, + "step": 9622 + }, + { + "epoch": 1.7870009285051067, + "grad_norm": 1.5058932304382324, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8797788619995117, + "num_tokens": 350868566.0, + "step": 9623 + }, + { + "epoch": 1.7871866295264622, + "grad_norm": 1.4259214401245117, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8910597562789917, + "num_tokens": 350905288.0, + "step": 9624 + }, + { + "epoch": 1.787372330547818, + "grad_norm": 1.4753631353378296, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8909692168235779, + "num_tokens": 350942477.0, + "step": 9625 + }, + { + "epoch": 1.7875580315691737, + "grad_norm": 1.5849839448928833, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.875087559223175, + "num_tokens": 350976262.0, + "step": 9626 + }, + { + "epoch": 1.7877437325905292, + "grad_norm": 1.3515416383743286, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8840636610984802, + "num_tokens": 351021028.0, + "step": 9627 + }, + { + "epoch": 1.7879294336118847, + "grad_norm": 1.6457014083862305, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8828777074813843, + "num_tokens": 351051686.0, + "step": 9628 + }, + { + "epoch": 1.7881151346332405, + "grad_norm": 1.508302092552185, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8694103360176086, + "num_tokens": 351094589.0, + "step": 9629 + }, + { + "epoch": 1.7883008356545962, + "grad_norm": 1.557686448097229, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8946695923805237, + "num_tokens": 351125378.0, + "step": 9630 + }, + { + "epoch": 1.7884865366759517, + "grad_norm": 1.523587942123413, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8692526817321777, + "num_tokens": 351163307.0, + "step": 9631 + }, + { + "epoch": 1.7886722376973072, + "grad_norm": 1.4799525737762451, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8879992961883545, + "num_tokens": 351201113.0, + "step": 9632 + }, + { + "epoch": 1.788857938718663, + "grad_norm": 1.3288604021072388, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8756818771362305, + "num_tokens": 351245877.0, + "step": 9633 + }, + { + "epoch": 1.7890436397400187, + "grad_norm": 1.579798936843872, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8769106864929199, + "num_tokens": 351278799.0, + "step": 9634 + }, + { + "epoch": 1.7892293407613742, + "grad_norm": 1.4985724687576294, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8646342754364014, + "num_tokens": 351316909.0, + "step": 9635 + }, + { + "epoch": 1.7894150417827297, + "grad_norm": 1.460660457611084, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8769674897193909, + "num_tokens": 351355586.0, + "step": 9636 + }, + { + "epoch": 1.7896007428040854, + "grad_norm": 1.4622024297714233, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8796907663345337, + "num_tokens": 351392599.0, + "step": 9637 + }, + { + "epoch": 1.7897864438254412, + "grad_norm": 1.6589382886886597, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8704370260238647, + "num_tokens": 351422974.0, + "step": 9638 + }, + { + "epoch": 1.7899721448467967, + "grad_norm": 1.4792321920394897, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8931519985198975, + "num_tokens": 351462409.0, + "step": 9639 + }, + { + "epoch": 1.7901578458681522, + "grad_norm": 1.7265832424163818, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8626309633255005, + "num_tokens": 351497386.0, + "step": 9640 + }, + { + "epoch": 1.790343546889508, + "grad_norm": 1.6099506616592407, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8615845441818237, + "num_tokens": 351537317.0, + "step": 9641 + }, + { + "epoch": 1.7905292479108637, + "grad_norm": 1.5576783418655396, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8704754114151001, + "num_tokens": 351578134.0, + "step": 9642 + }, + { + "epoch": 1.7907149489322192, + "grad_norm": 1.4519039392471313, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8802791237831116, + "num_tokens": 351618839.0, + "step": 9643 + }, + { + "epoch": 1.7909006499535747, + "grad_norm": 1.563706636428833, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8763158321380615, + "num_tokens": 351654911.0, + "step": 9644 + }, + { + "epoch": 1.7910863509749304, + "grad_norm": 1.581329107284546, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8656644821166992, + "num_tokens": 351691689.0, + "step": 9645 + }, + { + "epoch": 1.7912720519962861, + "grad_norm": 1.7977862358093262, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.8947175741195679, + "num_tokens": 351719602.0, + "step": 9646 + }, + { + "epoch": 1.7914577530176417, + "grad_norm": 1.5505229234695435, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8847631216049194, + "num_tokens": 351753441.0, + "step": 9647 + }, + { + "epoch": 1.7916434540389972, + "grad_norm": 1.444451928138733, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8790152072906494, + "num_tokens": 351792915.0, + "step": 9648 + }, + { + "epoch": 1.791829155060353, + "grad_norm": 1.529083490371704, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8886100649833679, + "num_tokens": 351825815.0, + "step": 9649 + }, + { + "epoch": 1.7920148560817084, + "grad_norm": 1.658899188041687, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8826436996459961, + "num_tokens": 351858009.0, + "step": 9650 + }, + { + "epoch": 1.792200557103064, + "grad_norm": 1.3605626821517944, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8829255700111389, + "num_tokens": 351900154.0, + "step": 9651 + }, + { + "epoch": 1.7923862581244197, + "grad_norm": 1.680234432220459, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.84999018907547, + "num_tokens": 351938202.0, + "step": 9652 + }, + { + "epoch": 1.7925719591457754, + "grad_norm": 1.4772688150405884, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8768055438995361, + "num_tokens": 351976904.0, + "step": 9653 + }, + { + "epoch": 1.792757660167131, + "grad_norm": 1.3988343477249146, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8744839429855347, + "num_tokens": 352018402.0, + "step": 9654 + }, + { + "epoch": 1.7929433611884864, + "grad_norm": 1.6065038442611694, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8522737622261047, + "num_tokens": 352054113.0, + "step": 9655 + }, + { + "epoch": 1.7931290622098421, + "grad_norm": 1.3890429735183716, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8786851167678833, + "num_tokens": 352097057.0, + "step": 9656 + }, + { + "epoch": 1.7933147632311979, + "grad_norm": 1.5824966430664062, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8739870190620422, + "num_tokens": 352132582.0, + "step": 9657 + }, + { + "epoch": 1.7935004642525534, + "grad_norm": 1.582831621170044, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8734766244888306, + "num_tokens": 352165804.0, + "step": 9658 + }, + { + "epoch": 1.793686165273909, + "grad_norm": 1.5528069734573364, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8740801811218262, + "num_tokens": 352203062.0, + "step": 9659 + }, + { + "epoch": 1.7938718662952646, + "grad_norm": 1.559608817100525, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8807400465011597, + "num_tokens": 352240269.0, + "step": 9660 + }, + { + "epoch": 1.7940575673166204, + "grad_norm": 1.671455979347229, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8623203635215759, + "num_tokens": 352272764.0, + "step": 9661 + }, + { + "epoch": 1.7942432683379759, + "grad_norm": 1.6776593923568726, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8671668767929077, + "num_tokens": 352303210.0, + "step": 9662 + }, + { + "epoch": 1.7944289693593314, + "grad_norm": 1.6036964654922485, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8790954351425171, + "num_tokens": 352334589.0, + "step": 9663 + }, + { + "epoch": 1.7946146703806871, + "grad_norm": 1.5412958860397339, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8841198682785034, + "num_tokens": 352366923.0, + "step": 9664 + }, + { + "epoch": 1.7948003714020428, + "grad_norm": 1.516945242881775, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8783348798751831, + "num_tokens": 352402461.0, + "step": 9665 + }, + { + "epoch": 1.7949860724233984, + "grad_norm": 1.5647307634353638, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8678324818611145, + "num_tokens": 352440813.0, + "step": 9666 + }, + { + "epoch": 1.7951717734447539, + "grad_norm": 1.602087140083313, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8654341697692871, + "num_tokens": 352479282.0, + "step": 9667 + }, + { + "epoch": 1.7953574744661096, + "grad_norm": 1.6546131372451782, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8642704486846924, + "num_tokens": 352514601.0, + "step": 9668 + }, + { + "epoch": 1.7955431754874653, + "grad_norm": 1.5536381006240845, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8823108673095703, + "num_tokens": 352549792.0, + "step": 9669 + }, + { + "epoch": 1.7957288765088208, + "grad_norm": 1.5754314661026, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8787591457366943, + "num_tokens": 352584420.0, + "step": 9670 + }, + { + "epoch": 1.7959145775301764, + "grad_norm": 1.7003302574157715, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8772180080413818, + "num_tokens": 352615973.0, + "step": 9671 + }, + { + "epoch": 1.796100278551532, + "grad_norm": 1.508776068687439, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8772094249725342, + "num_tokens": 352653342.0, + "step": 9672 + }, + { + "epoch": 1.7962859795728876, + "grad_norm": 1.5284241437911987, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8783742189407349, + "num_tokens": 352691066.0, + "step": 9673 + }, + { + "epoch": 1.7964716805942431, + "grad_norm": 1.3838281631469727, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8808220624923706, + "num_tokens": 352729918.0, + "step": 9674 + }, + { + "epoch": 1.7966573816155988, + "grad_norm": 1.4840917587280273, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8753130435943604, + "num_tokens": 352768624.0, + "step": 9675 + }, + { + "epoch": 1.7968430826369546, + "grad_norm": 1.5085946321487427, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8717960119247437, + "num_tokens": 352809101.0, + "step": 9676 + }, + { + "epoch": 1.79702878365831, + "grad_norm": 1.577890396118164, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8872148394584656, + "num_tokens": 352842162.0, + "step": 9677 + }, + { + "epoch": 1.7972144846796656, + "grad_norm": 1.6127548217773438, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8648717999458313, + "num_tokens": 352879574.0, + "step": 9678 + }, + { + "epoch": 1.7974001857010213, + "grad_norm": 1.7043431997299194, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.879738986492157, + "num_tokens": 352916293.0, + "step": 9679 + }, + { + "epoch": 1.797585886722377, + "grad_norm": 1.5547767877578735, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8740615844726562, + "num_tokens": 352956613.0, + "step": 9680 + }, + { + "epoch": 1.7977715877437326, + "grad_norm": 1.5378937721252441, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8789530992507935, + "num_tokens": 352997074.0, + "step": 9681 + }, + { + "epoch": 1.797957288765088, + "grad_norm": 1.4737803936004639, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.893984317779541, + "num_tokens": 353033278.0, + "step": 9682 + }, + { + "epoch": 1.7981429897864438, + "grad_norm": 1.4592610597610474, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8826301097869873, + "num_tokens": 353071684.0, + "step": 9683 + }, + { + "epoch": 1.7983286908077996, + "grad_norm": 1.6019303798675537, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8835870623588562, + "num_tokens": 353109608.0, + "step": 9684 + }, + { + "epoch": 1.798514391829155, + "grad_norm": 1.5133919715881348, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8851303458213806, + "num_tokens": 353142711.0, + "step": 9685 + }, + { + "epoch": 1.7987000928505106, + "grad_norm": 1.6357718706130981, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8661370277404785, + "num_tokens": 353177469.0, + "step": 9686 + }, + { + "epoch": 1.7988857938718663, + "grad_norm": 1.548923134803772, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8840243816375732, + "num_tokens": 353214925.0, + "step": 9687 + }, + { + "epoch": 1.799071494893222, + "grad_norm": 1.5778546333312988, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8868834376335144, + "num_tokens": 353248556.0, + "step": 9688 + }, + { + "epoch": 1.7992571959145776, + "grad_norm": 1.4849759340286255, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.88134765625, + "num_tokens": 353286293.0, + "step": 9689 + }, + { + "epoch": 1.799442896935933, + "grad_norm": 1.5160044431686401, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8712106943130493, + "num_tokens": 353324801.0, + "step": 9690 + }, + { + "epoch": 1.7996285979572888, + "grad_norm": 1.5835375785827637, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8798396587371826, + "num_tokens": 353358637.0, + "step": 9691 + }, + { + "epoch": 1.7998142989786445, + "grad_norm": 1.4483082294464111, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8831629157066345, + "num_tokens": 353399171.0, + "step": 9692 + }, + { + "epoch": 1.8, + "grad_norm": 1.5012022256851196, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8720972537994385, + "num_tokens": 353438648.0, + "step": 9693 + }, + { + "epoch": 1.8001857010213556, + "grad_norm": 1.529239296913147, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8734609484672546, + "num_tokens": 353477866.0, + "step": 9694 + }, + { + "epoch": 1.8003714020427113, + "grad_norm": 1.5889298915863037, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8824468851089478, + "num_tokens": 353510693.0, + "step": 9695 + }, + { + "epoch": 1.8005571030640668, + "grad_norm": 1.4028074741363525, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8885645270347595, + "num_tokens": 353547959.0, + "step": 9696 + }, + { + "epoch": 1.8007428040854223, + "grad_norm": 1.4433146715164185, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8879361152648926, + "num_tokens": 353586853.0, + "step": 9697 + }, + { + "epoch": 1.800928505106778, + "grad_norm": 1.6230984926223755, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8684630393981934, + "num_tokens": 353621833.0, + "step": 9698 + }, + { + "epoch": 1.8011142061281338, + "grad_norm": 1.7653781175613403, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.879070520401001, + "num_tokens": 353649751.0, + "step": 9699 + }, + { + "epoch": 1.8012999071494893, + "grad_norm": 1.4523589611053467, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8840333223342896, + "num_tokens": 353687749.0, + "step": 9700 + }, + { + "epoch": 1.8014856081708448, + "grad_norm": 1.5229945182800293, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8891891837120056, + "num_tokens": 353721810.0, + "step": 9701 + }, + { + "epoch": 1.8016713091922005, + "grad_norm": 1.4254099130630493, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8748533725738525, + "num_tokens": 353765604.0, + "step": 9702 + }, + { + "epoch": 1.8018570102135563, + "grad_norm": 1.5498727560043335, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8823455572128296, + "num_tokens": 353800646.0, + "step": 9703 + }, + { + "epoch": 1.8020427112349118, + "grad_norm": 1.4687132835388184, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8836371302604675, + "num_tokens": 353838675.0, + "step": 9704 + }, + { + "epoch": 1.8022284122562673, + "grad_norm": 1.6107474565505981, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.889528751373291, + "num_tokens": 353870017.0, + "step": 9705 + }, + { + "epoch": 1.802414113277623, + "grad_norm": 1.380807876586914, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8890792727470398, + "num_tokens": 353909174.0, + "step": 9706 + }, + { + "epoch": 1.8025998142989788, + "grad_norm": 1.5688295364379883, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8769446611404419, + "num_tokens": 353943202.0, + "step": 9707 + }, + { + "epoch": 1.8027855153203343, + "grad_norm": 1.6107550859451294, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8837293982505798, + "num_tokens": 353977679.0, + "step": 9708 + }, + { + "epoch": 1.8029712163416898, + "grad_norm": 1.7180981636047363, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.855048656463623, + "num_tokens": 354010621.0, + "step": 9709 + }, + { + "epoch": 1.8031569173630455, + "grad_norm": 1.6726218461990356, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.880427360534668, + "num_tokens": 354047123.0, + "step": 9710 + }, + { + "epoch": 1.8033426183844012, + "grad_norm": 1.6166365146636963, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8786377906799316, + "num_tokens": 354084661.0, + "step": 9711 + }, + { + "epoch": 1.8035283194057568, + "grad_norm": 1.3619202375411987, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8929952383041382, + "num_tokens": 354126475.0, + "step": 9712 + }, + { + "epoch": 1.8037140204271123, + "grad_norm": 1.3911592960357666, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8865475654602051, + "num_tokens": 354168323.0, + "step": 9713 + }, + { + "epoch": 1.803899721448468, + "grad_norm": 1.5754276514053345, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8675886392593384, + "num_tokens": 354207140.0, + "step": 9714 + }, + { + "epoch": 1.8040854224698237, + "grad_norm": 1.5353225469589233, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8577343225479126, + "num_tokens": 354249833.0, + "step": 9715 + }, + { + "epoch": 1.8042711234911792, + "grad_norm": 1.4914398193359375, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8747777938842773, + "num_tokens": 354286047.0, + "step": 9716 + }, + { + "epoch": 1.8044568245125348, + "grad_norm": 1.5913559198379517, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8789114356040955, + "num_tokens": 354319579.0, + "step": 9717 + }, + { + "epoch": 1.8046425255338905, + "grad_norm": 1.5723435878753662, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8831518292427063, + "num_tokens": 354356674.0, + "step": 9718 + }, + { + "epoch": 1.8048282265552462, + "grad_norm": 1.5549618005752563, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8660184144973755, + "num_tokens": 354397088.0, + "step": 9719 + }, + { + "epoch": 1.8050139275766015, + "grad_norm": 1.606351613998413, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8856579661369324, + "num_tokens": 354428491.0, + "step": 9720 + }, + { + "epoch": 1.8051996285979572, + "grad_norm": 1.6499593257904053, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.876121997833252, + "num_tokens": 354460229.0, + "step": 9721 + }, + { + "epoch": 1.805385329619313, + "grad_norm": 1.482206106185913, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8761581182479858, + "num_tokens": 354501163.0, + "step": 9722 + }, + { + "epoch": 1.8055710306406685, + "grad_norm": 1.5452731847763062, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.876207172870636, + "num_tokens": 354537599.0, + "step": 9723 + }, + { + "epoch": 1.805756731662024, + "grad_norm": 1.5680124759674072, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8754147291183472, + "num_tokens": 354573118.0, + "step": 9724 + }, + { + "epoch": 1.8059424326833797, + "grad_norm": 1.4264137744903564, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8749377131462097, + "num_tokens": 354612338.0, + "step": 9725 + }, + { + "epoch": 1.8061281337047355, + "grad_norm": 1.6457138061523438, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8744158744812012, + "num_tokens": 354646121.0, + "step": 9726 + }, + { + "epoch": 1.806313834726091, + "grad_norm": 1.5767446756362915, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8823861479759216, + "num_tokens": 354678508.0, + "step": 9727 + }, + { + "epoch": 1.8064995357474465, + "grad_norm": 1.4606744050979614, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8834434747695923, + "num_tokens": 354714180.0, + "step": 9728 + }, + { + "epoch": 1.8066852367688022, + "grad_norm": 1.5100892782211304, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8717827796936035, + "num_tokens": 354750279.0, + "step": 9729 + }, + { + "epoch": 1.806870937790158, + "grad_norm": 1.5665425062179565, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8731114864349365, + "num_tokens": 354787938.0, + "step": 9730 + }, + { + "epoch": 1.8070566388115135, + "grad_norm": 1.6528923511505127, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8817180395126343, + "num_tokens": 354820925.0, + "step": 9731 + }, + { + "epoch": 1.807242339832869, + "grad_norm": 1.5544166564941406, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8812571167945862, + "num_tokens": 354854436.0, + "step": 9732 + }, + { + "epoch": 1.8074280408542247, + "grad_norm": 1.4792225360870361, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.882887601852417, + "num_tokens": 354890413.0, + "step": 9733 + }, + { + "epoch": 1.8076137418755804, + "grad_norm": 1.5554713010787964, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8704163432121277, + "num_tokens": 354928752.0, + "step": 9734 + }, + { + "epoch": 1.807799442896936, + "grad_norm": 1.5357979536056519, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8878096342086792, + "num_tokens": 354966750.0, + "step": 9735 + }, + { + "epoch": 1.8079851439182915, + "grad_norm": 1.525457739830017, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8742824792861938, + "num_tokens": 355007116.0, + "step": 9736 + }, + { + "epoch": 1.8081708449396472, + "grad_norm": 1.5853153467178345, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8688840270042419, + "num_tokens": 355042427.0, + "step": 9737 + }, + { + "epoch": 1.808356545961003, + "grad_norm": 1.6438961029052734, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8763607740402222, + "num_tokens": 355073837.0, + "step": 9738 + }, + { + "epoch": 1.8085422469823584, + "grad_norm": 1.499155879020691, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.8997080326080322, + "num_tokens": 355110631.0, + "step": 9739 + }, + { + "epoch": 1.808727948003714, + "grad_norm": 1.419363260269165, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8868277668952942, + "num_tokens": 355149368.0, + "step": 9740 + }, + { + "epoch": 1.8089136490250697, + "grad_norm": 1.6109484434127808, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8893289566040039, + "num_tokens": 355183853.0, + "step": 9741 + }, + { + "epoch": 1.8090993500464254, + "grad_norm": 1.6335762739181519, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8853764533996582, + "num_tokens": 355216040.0, + "step": 9742 + }, + { + "epoch": 1.809285051067781, + "grad_norm": 1.4511250257492065, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8822832107543945, + "num_tokens": 355255236.0, + "step": 9743 + }, + { + "epoch": 1.8094707520891364, + "grad_norm": 1.6173967123031616, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8703374862670898, + "num_tokens": 355288958.0, + "step": 9744 + }, + { + "epoch": 1.8096564531104922, + "grad_norm": 1.5191338062286377, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8807936906814575, + "num_tokens": 355325948.0, + "step": 9745 + }, + { + "epoch": 1.8098421541318477, + "grad_norm": 1.682031512260437, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8705721497535706, + "num_tokens": 355365158.0, + "step": 9746 + }, + { + "epoch": 1.8100278551532032, + "grad_norm": 1.5626513957977295, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8614071607589722, + "num_tokens": 355405648.0, + "step": 9747 + }, + { + "epoch": 1.810213556174559, + "grad_norm": 1.6146291494369507, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8807903528213501, + "num_tokens": 355443218.0, + "step": 9748 + }, + { + "epoch": 1.8103992571959147, + "grad_norm": 1.5093111991882324, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8887391090393066, + "num_tokens": 355482675.0, + "step": 9749 + }, + { + "epoch": 1.8105849582172702, + "grad_norm": 1.653222918510437, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8731946349143982, + "num_tokens": 355520981.0, + "step": 9750 + }, + { + "epoch": 1.8107706592386257, + "grad_norm": 1.5550081729888916, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8834753036499023, + "num_tokens": 355556911.0, + "step": 9751 + }, + { + "epoch": 1.8109563602599814, + "grad_norm": 1.7264164686203003, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8744150400161743, + "num_tokens": 355588776.0, + "step": 9752 + }, + { + "epoch": 1.8111420612813371, + "grad_norm": 1.5078753232955933, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8881930708885193, + "num_tokens": 355626063.0, + "step": 9753 + }, + { + "epoch": 1.8113277623026927, + "grad_norm": 1.4511622190475464, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8817858695983887, + "num_tokens": 355664099.0, + "step": 9754 + }, + { + "epoch": 1.8115134633240482, + "grad_norm": 1.4782464504241943, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8718651533126831, + "num_tokens": 355705726.0, + "step": 9755 + }, + { + "epoch": 1.811699164345404, + "grad_norm": 1.5040335655212402, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8844245672225952, + "num_tokens": 355742476.0, + "step": 9756 + }, + { + "epoch": 1.8118848653667596, + "grad_norm": 1.6735050678253174, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8712651133537292, + "num_tokens": 355780589.0, + "step": 9757 + }, + { + "epoch": 1.8120705663881151, + "grad_norm": 1.7540127038955688, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8844088912010193, + "num_tokens": 355812063.0, + "step": 9758 + }, + { + "epoch": 1.8122562674094707, + "grad_norm": 1.4670770168304443, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8658933639526367, + "num_tokens": 355854488.0, + "step": 9759 + }, + { + "epoch": 1.8124419684308264, + "grad_norm": 1.484210729598999, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8892368674278259, + "num_tokens": 355889676.0, + "step": 9760 + }, + { + "epoch": 1.8126276694521821, + "grad_norm": 1.577883005142212, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8721157312393188, + "num_tokens": 355927156.0, + "step": 9761 + }, + { + "epoch": 1.8128133704735376, + "grad_norm": 1.518086552619934, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8762747049331665, + "num_tokens": 355963308.0, + "step": 9762 + }, + { + "epoch": 1.8129990714948931, + "grad_norm": 1.516066074371338, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8748438954353333, + "num_tokens": 356001757.0, + "step": 9763 + }, + { + "epoch": 1.8131847725162489, + "grad_norm": 1.6572270393371582, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8640377521514893, + "num_tokens": 356039488.0, + "step": 9764 + }, + { + "epoch": 1.8133704735376046, + "grad_norm": 1.4828335046768188, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8832554221153259, + "num_tokens": 356077747.0, + "step": 9765 + }, + { + "epoch": 1.8135561745589601, + "grad_norm": 1.5821988582611084, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8813397884368896, + "num_tokens": 356111809.0, + "step": 9766 + }, + { + "epoch": 1.8137418755803156, + "grad_norm": 1.629548192024231, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8755472302436829, + "num_tokens": 356144669.0, + "step": 9767 + }, + { + "epoch": 1.8139275766016714, + "grad_norm": 1.4368674755096436, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8838204145431519, + "num_tokens": 356182685.0, + "step": 9768 + }, + { + "epoch": 1.8141132776230269, + "grad_norm": 1.6485413312911987, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8750899434089661, + "num_tokens": 356220284.0, + "step": 9769 + }, + { + "epoch": 1.8142989786443824, + "grad_norm": 1.5963844060897827, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8908426761627197, + "num_tokens": 356248059.0, + "step": 9770 + }, + { + "epoch": 1.8144846796657381, + "grad_norm": 1.4281750917434692, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8766491413116455, + "num_tokens": 356291236.0, + "step": 9771 + }, + { + "epoch": 1.8146703806870939, + "grad_norm": 1.6283470392227173, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8590670824050903, + "num_tokens": 356327370.0, + "step": 9772 + }, + { + "epoch": 1.8148560817084494, + "grad_norm": 1.5269886255264282, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8878194093704224, + "num_tokens": 356362806.0, + "step": 9773 + }, + { + "epoch": 1.8150417827298049, + "grad_norm": 1.4978643655776978, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.879653811454773, + "num_tokens": 356399186.0, + "step": 9774 + }, + { + "epoch": 1.8152274837511606, + "grad_norm": 1.5524500608444214, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8662261366844177, + "num_tokens": 356436513.0, + "step": 9775 + }, + { + "epoch": 1.8154131847725163, + "grad_norm": 1.6485549211502075, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8527777194976807, + "num_tokens": 356475244.0, + "step": 9776 + }, + { + "epoch": 1.8155988857938719, + "grad_norm": 1.4745715856552124, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8825100660324097, + "num_tokens": 356512778.0, + "step": 9777 + }, + { + "epoch": 1.8157845868152274, + "grad_norm": 1.5671159029006958, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8702129125595093, + "num_tokens": 356549130.0, + "step": 9778 + }, + { + "epoch": 1.815970287836583, + "grad_norm": 1.5680760145187378, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8911910057067871, + "num_tokens": 356583872.0, + "step": 9779 + }, + { + "epoch": 1.8161559888579388, + "grad_norm": 1.6687445640563965, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8780934810638428, + "num_tokens": 356618098.0, + "step": 9780 + }, + { + "epoch": 1.8163416898792943, + "grad_norm": 1.4643741846084595, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.87064129114151, + "num_tokens": 356660972.0, + "step": 9781 + }, + { + "epoch": 1.8165273909006499, + "grad_norm": 1.5768948793411255, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8958936929702759, + "num_tokens": 356695116.0, + "step": 9782 + }, + { + "epoch": 1.8167130919220056, + "grad_norm": 1.6111758947372437, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8714885711669922, + "num_tokens": 356731353.0, + "step": 9783 + }, + { + "epoch": 1.8168987929433613, + "grad_norm": 1.5461206436157227, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8826582431793213, + "num_tokens": 356765870.0, + "step": 9784 + }, + { + "epoch": 1.8170844939647168, + "grad_norm": 1.5459821224212646, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8797458410263062, + "num_tokens": 356805475.0, + "step": 9785 + }, + { + "epoch": 1.8172701949860723, + "grad_norm": 1.5138647556304932, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8831192255020142, + "num_tokens": 356845185.0, + "step": 9786 + }, + { + "epoch": 1.817455896007428, + "grad_norm": 1.4502431154251099, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8864472508430481, + "num_tokens": 356888324.0, + "step": 9787 + }, + { + "epoch": 1.8176415970287838, + "grad_norm": 1.5872358083724976, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8842333555221558, + "num_tokens": 356930476.0, + "step": 9788 + }, + { + "epoch": 1.8178272980501393, + "grad_norm": 1.4889992475509644, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8774119019508362, + "num_tokens": 356971155.0, + "step": 9789 + }, + { + "epoch": 1.8180129990714948, + "grad_norm": 1.6066356897354126, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8758898973464966, + "num_tokens": 357006692.0, + "step": 9790 + }, + { + "epoch": 1.8181987000928506, + "grad_norm": 1.569857120513916, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8856897354125977, + "num_tokens": 357039055.0, + "step": 9791 + }, + { + "epoch": 1.8183844011142063, + "grad_norm": 1.5175890922546387, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8693372011184692, + "num_tokens": 357080086.0, + "step": 9792 + }, + { + "epoch": 1.8185701021355616, + "grad_norm": 1.6168755292892456, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8572890162467957, + "num_tokens": 357117697.0, + "step": 9793 + }, + { + "epoch": 1.8187558031569173, + "grad_norm": 1.601778268814087, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8795768022537231, + "num_tokens": 357151762.0, + "step": 9794 + }, + { + "epoch": 1.818941504178273, + "grad_norm": 1.512163519859314, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8706520795822144, + "num_tokens": 357191327.0, + "step": 9795 + }, + { + "epoch": 1.8191272051996286, + "grad_norm": 1.4895669221878052, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8821016550064087, + "num_tokens": 357228219.0, + "step": 9796 + }, + { + "epoch": 1.819312906220984, + "grad_norm": 1.4569354057312012, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8889018297195435, + "num_tokens": 357265025.0, + "step": 9797 + }, + { + "epoch": 1.8194986072423398, + "grad_norm": 1.5200459957122803, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8802419900894165, + "num_tokens": 357300381.0, + "step": 9798 + }, + { + "epoch": 1.8196843082636955, + "grad_norm": 1.4189229011535645, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8831312656402588, + "num_tokens": 357339887.0, + "step": 9799 + }, + { + "epoch": 1.819870009285051, + "grad_norm": 1.4922462701797485, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8851973414421082, + "num_tokens": 357373058.0, + "step": 9800 + }, + { + "epoch": 1.8200557103064066, + "grad_norm": 1.441450595855713, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8773505687713623, + "num_tokens": 357416328.0, + "step": 9801 + }, + { + "epoch": 1.8202414113277623, + "grad_norm": 1.6108078956604004, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8679876923561096, + "num_tokens": 357451580.0, + "step": 9802 + }, + { + "epoch": 1.820427112349118, + "grad_norm": 1.5298408269882202, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8660889863967896, + "num_tokens": 357492397.0, + "step": 9803 + }, + { + "epoch": 1.8206128133704735, + "grad_norm": 1.4912962913513184, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8612720966339111, + "num_tokens": 357534065.0, + "step": 9804 + }, + { + "epoch": 1.820798514391829, + "grad_norm": 1.3902363777160645, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8909144997596741, + "num_tokens": 357575131.0, + "step": 9805 + }, + { + "epoch": 1.8209842154131848, + "grad_norm": 1.4780042171478271, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8793922662734985, + "num_tokens": 357618959.0, + "step": 9806 + }, + { + "epoch": 1.8211699164345405, + "grad_norm": 1.7274744510650635, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.86985844373703, + "num_tokens": 357652036.0, + "step": 9807 + }, + { + "epoch": 1.821355617455896, + "grad_norm": 1.5308758020401, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8765883445739746, + "num_tokens": 357690063.0, + "step": 9808 + }, + { + "epoch": 1.8215413184772515, + "grad_norm": 1.592436671257019, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8656140565872192, + "num_tokens": 357724609.0, + "step": 9809 + }, + { + "epoch": 1.8217270194986073, + "grad_norm": 1.730889081954956, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8930561542510986, + "num_tokens": 357751590.0, + "step": 9810 + }, + { + "epoch": 1.821912720519963, + "grad_norm": 1.5167607069015503, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8834012746810913, + "num_tokens": 357787670.0, + "step": 9811 + }, + { + "epoch": 1.8220984215413185, + "grad_norm": 1.5110715627670288, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8870220184326172, + "num_tokens": 357826299.0, + "step": 9812 + }, + { + "epoch": 1.822284122562674, + "grad_norm": 1.5142717361450195, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8816320896148682, + "num_tokens": 357862050.0, + "step": 9813 + }, + { + "epoch": 1.8224698235840298, + "grad_norm": 1.5546399354934692, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8733658790588379, + "num_tokens": 357896940.0, + "step": 9814 + }, + { + "epoch": 1.8226555246053855, + "grad_norm": 1.4548532962799072, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.880107045173645, + "num_tokens": 357934793.0, + "step": 9815 + }, + { + "epoch": 1.822841225626741, + "grad_norm": 1.581193447113037, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8707330822944641, + "num_tokens": 357971666.0, + "step": 9816 + }, + { + "epoch": 1.8230269266480965, + "grad_norm": 1.6556675434112549, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.875319242477417, + "num_tokens": 358002345.0, + "step": 9817 + }, + { + "epoch": 1.8232126276694522, + "grad_norm": 1.532159686088562, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8905807733535767, + "num_tokens": 358037935.0, + "step": 9818 + }, + { + "epoch": 1.8233983286908078, + "grad_norm": 1.5695416927337646, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8759244680404663, + "num_tokens": 358083104.0, + "step": 9819 + }, + { + "epoch": 1.8235840297121633, + "grad_norm": 1.3597594499588013, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8925602436065674, + "num_tokens": 358124950.0, + "step": 9820 + }, + { + "epoch": 1.823769730733519, + "grad_norm": 1.5293326377868652, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8828133940696716, + "num_tokens": 358161264.0, + "step": 9821 + }, + { + "epoch": 1.8239554317548747, + "grad_norm": 1.6107245683670044, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8839019536972046, + "num_tokens": 358196997.0, + "step": 9822 + }, + { + "epoch": 1.8241411327762302, + "grad_norm": 1.6041048765182495, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8813397884368896, + "num_tokens": 358231090.0, + "step": 9823 + }, + { + "epoch": 1.8243268337975858, + "grad_norm": 1.5744712352752686, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8777450323104858, + "num_tokens": 358269185.0, + "step": 9824 + }, + { + "epoch": 1.8245125348189415, + "grad_norm": 1.4294812679290771, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8905206918716431, + "num_tokens": 358306943.0, + "step": 9825 + }, + { + "epoch": 1.8246982358402972, + "grad_norm": 1.4298335313796997, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8832122087478638, + "num_tokens": 358348742.0, + "step": 9826 + }, + { + "epoch": 1.8248839368616527, + "grad_norm": 1.4080268144607544, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8762404918670654, + "num_tokens": 358391106.0, + "step": 9827 + }, + { + "epoch": 1.8250696378830082, + "grad_norm": 1.552267074584961, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8791953325271606, + "num_tokens": 358424789.0, + "step": 9828 + }, + { + "epoch": 1.825255338904364, + "grad_norm": 1.4787551164627075, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8790984153747559, + "num_tokens": 358464624.0, + "step": 9829 + }, + { + "epoch": 1.8254410399257197, + "grad_norm": 1.5014641284942627, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8864269852638245, + "num_tokens": 358501154.0, + "step": 9830 + }, + { + "epoch": 1.8256267409470752, + "grad_norm": 1.6082433462142944, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8658982515335083, + "num_tokens": 358536537.0, + "step": 9831 + }, + { + "epoch": 1.8258124419684307, + "grad_norm": 1.5466151237487793, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.890421450138092, + "num_tokens": 358572516.0, + "step": 9832 + }, + { + "epoch": 1.8259981429897865, + "grad_norm": 1.4766677618026733, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8805114030838013, + "num_tokens": 358613880.0, + "step": 9833 + }, + { + "epoch": 1.8261838440111422, + "grad_norm": 1.6342558860778809, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8736667633056641, + "num_tokens": 358650078.0, + "step": 9834 + }, + { + "epoch": 1.8263695450324977, + "grad_norm": 1.4219846725463867, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8934203386306763, + "num_tokens": 358689121.0, + "step": 9835 + }, + { + "epoch": 1.8265552460538532, + "grad_norm": 1.5317144393920898, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8745993971824646, + "num_tokens": 358726989.0, + "step": 9836 + }, + { + "epoch": 1.826740947075209, + "grad_norm": 1.4334074258804321, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8909730911254883, + "num_tokens": 358766293.0, + "step": 9837 + }, + { + "epoch": 1.8269266480965647, + "grad_norm": 1.4754810333251953, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8905106782913208, + "num_tokens": 358800399.0, + "step": 9838 + }, + { + "epoch": 1.8271123491179202, + "grad_norm": 1.5068373680114746, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8745587468147278, + "num_tokens": 358838542.0, + "step": 9839 + }, + { + "epoch": 1.8272980501392757, + "grad_norm": 1.5926361083984375, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8706401586532593, + "num_tokens": 358871392.0, + "step": 9840 + }, + { + "epoch": 1.8274837511606314, + "grad_norm": 1.584216594696045, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8811651468276978, + "num_tokens": 358904210.0, + "step": 9841 + }, + { + "epoch": 1.827669452181987, + "grad_norm": 1.458781123161316, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8903773427009583, + "num_tokens": 358941602.0, + "step": 9842 + }, + { + "epoch": 1.8278551532033425, + "grad_norm": 1.501819372177124, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8916332125663757, + "num_tokens": 358975437.0, + "step": 9843 + }, + { + "epoch": 1.8280408542246982, + "grad_norm": 1.5934184789657593, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8870897889137268, + "num_tokens": 359009089.0, + "step": 9844 + }, + { + "epoch": 1.828226555246054, + "grad_norm": 1.4968641996383667, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8750365972518921, + "num_tokens": 359047668.0, + "step": 9845 + }, + { + "epoch": 1.8284122562674094, + "grad_norm": 1.569176435470581, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8692834377288818, + "num_tokens": 359082613.0, + "step": 9846 + }, + { + "epoch": 1.828597957288765, + "grad_norm": 1.4634417295455933, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8834575414657593, + "num_tokens": 359121539.0, + "step": 9847 + }, + { + "epoch": 1.8287836583101207, + "grad_norm": 1.571553111076355, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8725860714912415, + "num_tokens": 359154994.0, + "step": 9848 + }, + { + "epoch": 1.8289693593314764, + "grad_norm": 1.4500679969787598, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8807576894760132, + "num_tokens": 359193939.0, + "step": 9849 + }, + { + "epoch": 1.829155060352832, + "grad_norm": 1.5769976377487183, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8771076202392578, + "num_tokens": 359228128.0, + "step": 9850 + }, + { + "epoch": 1.8293407613741874, + "grad_norm": 1.4357895851135254, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.88447105884552, + "num_tokens": 359266615.0, + "step": 9851 + }, + { + "epoch": 1.8295264623955432, + "grad_norm": 1.480703592300415, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8706851601600647, + "num_tokens": 359305562.0, + "step": 9852 + }, + { + "epoch": 1.829712163416899, + "grad_norm": 1.3988145589828491, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8878456950187683, + "num_tokens": 359344161.0, + "step": 9853 + }, + { + "epoch": 1.8298978644382544, + "grad_norm": 1.5760977268218994, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8826543688774109, + "num_tokens": 359375664.0, + "step": 9854 + }, + { + "epoch": 1.83008356545961, + "grad_norm": 1.5455259084701538, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8827412128448486, + "num_tokens": 359409998.0, + "step": 9855 + }, + { + "epoch": 1.8302692664809657, + "grad_norm": 1.42279851436615, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8869646787643433, + "num_tokens": 359451035.0, + "step": 9856 + }, + { + "epoch": 1.8304549675023214, + "grad_norm": 1.5731979608535767, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8735480308532715, + "num_tokens": 359485618.0, + "step": 9857 + }, + { + "epoch": 1.830640668523677, + "grad_norm": 1.660332441329956, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8635494709014893, + "num_tokens": 359522758.0, + "step": 9858 + }, + { + "epoch": 1.8308263695450324, + "grad_norm": 1.49968683719635, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8860270977020264, + "num_tokens": 359559052.0, + "step": 9859 + }, + { + "epoch": 1.8310120705663882, + "grad_norm": 1.5398800373077393, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8797771334648132, + "num_tokens": 359593857.0, + "step": 9860 + }, + { + "epoch": 1.8311977715877439, + "grad_norm": 1.4885531663894653, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8846706748008728, + "num_tokens": 359630148.0, + "step": 9861 + }, + { + "epoch": 1.8313834726090994, + "grad_norm": 1.449971318244934, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8886953592300415, + "num_tokens": 359671779.0, + "step": 9862 + }, + { + "epoch": 1.831569173630455, + "grad_norm": 1.4765475988388062, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8834033608436584, + "num_tokens": 359708194.0, + "step": 9863 + }, + { + "epoch": 1.8317548746518106, + "grad_norm": 1.547141432762146, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8766055107116699, + "num_tokens": 359744058.0, + "step": 9864 + }, + { + "epoch": 1.8319405756731661, + "grad_norm": 1.6578696966171265, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8596092462539673, + "num_tokens": 359776138.0, + "step": 9865 + }, + { + "epoch": 1.8321262766945217, + "grad_norm": 1.4165701866149902, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8708279132843018, + "num_tokens": 359815879.0, + "step": 9866 + }, + { + "epoch": 1.8323119777158774, + "grad_norm": 1.50346040725708, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8760362863540649, + "num_tokens": 359855148.0, + "step": 9867 + }, + { + "epoch": 1.8324976787372331, + "grad_norm": 1.403018593788147, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8779618740081787, + "num_tokens": 359895955.0, + "step": 9868 + }, + { + "epoch": 1.8326833797585886, + "grad_norm": 1.541788935661316, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8698439598083496, + "num_tokens": 359931276.0, + "step": 9869 + }, + { + "epoch": 1.8328690807799441, + "grad_norm": 1.4725323915481567, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.881989598274231, + "num_tokens": 359970087.0, + "step": 9870 + }, + { + "epoch": 1.8330547818012999, + "grad_norm": 1.4713845252990723, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8823679685592651, + "num_tokens": 360010762.0, + "step": 9871 + }, + { + "epoch": 1.8332404828226556, + "grad_norm": 1.3299304246902466, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8767199516296387, + "num_tokens": 360055225.0, + "step": 9872 + }, + { + "epoch": 1.8334261838440111, + "grad_norm": 1.5159027576446533, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8652981519699097, + "num_tokens": 360095001.0, + "step": 9873 + }, + { + "epoch": 1.8336118848653666, + "grad_norm": 1.546194314956665, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8794460296630859, + "num_tokens": 360132385.0, + "step": 9874 + }, + { + "epoch": 1.8337975858867224, + "grad_norm": 1.5044726133346558, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8753738403320312, + "num_tokens": 360171288.0, + "step": 9875 + }, + { + "epoch": 1.833983286908078, + "grad_norm": 1.572895884513855, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8786695599555969, + "num_tokens": 360207212.0, + "step": 9876 + }, + { + "epoch": 1.8341689879294336, + "grad_norm": 1.5022094249725342, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8796234130859375, + "num_tokens": 360248581.0, + "step": 9877 + }, + { + "epoch": 1.8343546889507891, + "grad_norm": 1.5000041723251343, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8891547322273254, + "num_tokens": 360283555.0, + "step": 9878 + }, + { + "epoch": 1.8345403899721449, + "grad_norm": 1.666213870048523, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8734484314918518, + "num_tokens": 360316693.0, + "step": 9879 + }, + { + "epoch": 1.8347260909935006, + "grad_norm": 1.527343988418579, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.865445077419281, + "num_tokens": 360354079.0, + "step": 9880 + }, + { + "epoch": 1.834911792014856, + "grad_norm": 1.4382904767990112, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8901476860046387, + "num_tokens": 360390052.0, + "step": 9881 + }, + { + "epoch": 1.8350974930362116, + "grad_norm": 1.5006062984466553, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.889917254447937, + "num_tokens": 360424998.0, + "step": 9882 + }, + { + "epoch": 1.8352831940575673, + "grad_norm": 1.5808501243591309, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8769864439964294, + "num_tokens": 360458975.0, + "step": 9883 + }, + { + "epoch": 1.835468895078923, + "grad_norm": 1.4710594415664673, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8748035430908203, + "num_tokens": 360501014.0, + "step": 9884 + }, + { + "epoch": 1.8356545961002786, + "grad_norm": 1.449578046798706, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.881372332572937, + "num_tokens": 360537476.0, + "step": 9885 + }, + { + "epoch": 1.835840297121634, + "grad_norm": 1.5772840976715088, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8696831464767456, + "num_tokens": 360571756.0, + "step": 9886 + }, + { + "epoch": 1.8360259981429898, + "grad_norm": 1.5011203289031982, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8674218654632568, + "num_tokens": 360611665.0, + "step": 9887 + }, + { + "epoch": 1.8362116991643456, + "grad_norm": 1.6392532587051392, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8918731808662415, + "num_tokens": 360642208.0, + "step": 9888 + }, + { + "epoch": 1.8363974001857009, + "grad_norm": 1.6030900478363037, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8939517736434937, + "num_tokens": 360672975.0, + "step": 9889 + }, + { + "epoch": 1.8365831012070566, + "grad_norm": 1.5018407106399536, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8889198303222656, + "num_tokens": 360707296.0, + "step": 9890 + }, + { + "epoch": 1.8367688022284123, + "grad_norm": 1.4187833070755005, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8961381316184998, + "num_tokens": 360743749.0, + "step": 9891 + }, + { + "epoch": 1.8369545032497678, + "grad_norm": 1.6467976570129395, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8689433336257935, + "num_tokens": 360779881.0, + "step": 9892 + }, + { + "epoch": 1.8371402042711233, + "grad_norm": 1.5790108442306519, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8817969560623169, + "num_tokens": 360814710.0, + "step": 9893 + }, + { + "epoch": 1.837325905292479, + "grad_norm": 1.5425152778625488, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.878574013710022, + "num_tokens": 360852513.0, + "step": 9894 + }, + { + "epoch": 1.8375116063138348, + "grad_norm": 1.5946675539016724, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8723416328430176, + "num_tokens": 360890929.0, + "step": 9895 + }, + { + "epoch": 1.8376973073351903, + "grad_norm": 1.5078670978546143, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8828641772270203, + "num_tokens": 360927087.0, + "step": 9896 + }, + { + "epoch": 1.8378830083565458, + "grad_norm": 1.544297695159912, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8756880760192871, + "num_tokens": 360964123.0, + "step": 9897 + }, + { + "epoch": 1.8380687093779016, + "grad_norm": 1.4841222763061523, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8851271867752075, + "num_tokens": 361000773.0, + "step": 9898 + }, + { + "epoch": 1.8382544103992573, + "grad_norm": 1.5950945615768433, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.880437970161438, + "num_tokens": 361032120.0, + "step": 9899 + }, + { + "epoch": 1.8384401114206128, + "grad_norm": 1.5257447957992554, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8829509615898132, + "num_tokens": 361066051.0, + "step": 9900 + }, + { + "epoch": 1.8386258124419683, + "grad_norm": 1.403117299079895, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8783912658691406, + "num_tokens": 361107811.0, + "step": 9901 + }, + { + "epoch": 1.838811513463324, + "grad_norm": 1.5428316593170166, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8723875284194946, + "num_tokens": 361144487.0, + "step": 9902 + }, + { + "epoch": 1.8389972144846798, + "grad_norm": 1.619012475013733, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.871722936630249, + "num_tokens": 361179898.0, + "step": 9903 + }, + { + "epoch": 1.8391829155060353, + "grad_norm": 1.2892979383468628, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8967964053153992, + "num_tokens": 361223155.0, + "step": 9904 + }, + { + "epoch": 1.8393686165273908, + "grad_norm": 1.7058360576629639, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8556779026985168, + "num_tokens": 361256781.0, + "step": 9905 + }, + { + "epoch": 1.8395543175487465, + "grad_norm": 1.461438775062561, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8715236186981201, + "num_tokens": 361298084.0, + "step": 9906 + }, + { + "epoch": 1.8397400185701023, + "grad_norm": 1.67060387134552, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8829230070114136, + "num_tokens": 361331103.0, + "step": 9907 + }, + { + "epoch": 1.8399257195914578, + "grad_norm": 1.5032042264938354, + "learning_rate": 1e-06, + "loss": 0.2765, + "mean_token_accuracy": 0.8964734077453613, + "num_tokens": 361363099.0, + "step": 9908 + }, + { + "epoch": 1.8401114206128133, + "grad_norm": 1.5819448232650757, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8786282539367676, + "num_tokens": 361398731.0, + "step": 9909 + }, + { + "epoch": 1.840297121634169, + "grad_norm": 1.478747010231018, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.886372447013855, + "num_tokens": 361441444.0, + "step": 9910 + }, + { + "epoch": 1.8404828226555248, + "grad_norm": 1.5649633407592773, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8892595767974854, + "num_tokens": 361474567.0, + "step": 9911 + }, + { + "epoch": 1.8406685236768803, + "grad_norm": 1.5281450748443604, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.880021333694458, + "num_tokens": 361510557.0, + "step": 9912 + }, + { + "epoch": 1.8408542246982358, + "grad_norm": 1.4994275569915771, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.882788360118866, + "num_tokens": 361547065.0, + "step": 9913 + }, + { + "epoch": 1.8410399257195915, + "grad_norm": 1.4533655643463135, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8799141049385071, + "num_tokens": 361586010.0, + "step": 9914 + }, + { + "epoch": 1.841225626740947, + "grad_norm": 1.45637845993042, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8755929470062256, + "num_tokens": 361626873.0, + "step": 9915 + }, + { + "epoch": 1.8414113277623025, + "grad_norm": 1.5338103771209717, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8822302222251892, + "num_tokens": 361662858.0, + "step": 9916 + }, + { + "epoch": 1.8415970287836583, + "grad_norm": 1.5993499755859375, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8754681348800659, + "num_tokens": 361696165.0, + "step": 9917 + }, + { + "epoch": 1.841782729805014, + "grad_norm": 1.6157188415527344, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8608620762825012, + "num_tokens": 361732810.0, + "step": 9918 + }, + { + "epoch": 1.8419684308263695, + "grad_norm": 1.5080015659332275, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8692691922187805, + "num_tokens": 361774390.0, + "step": 9919 + }, + { + "epoch": 1.842154131847725, + "grad_norm": 1.5561597347259521, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8774456977844238, + "num_tokens": 361812772.0, + "step": 9920 + }, + { + "epoch": 1.8423398328690808, + "grad_norm": 1.5923643112182617, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8732051849365234, + "num_tokens": 361847190.0, + "step": 9921 + }, + { + "epoch": 1.8425255338904365, + "grad_norm": 1.4977777004241943, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8972477912902832, + "num_tokens": 361881085.0, + "step": 9922 + }, + { + "epoch": 1.842711234911792, + "grad_norm": 1.5038796663284302, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8812885880470276, + "num_tokens": 361916906.0, + "step": 9923 + }, + { + "epoch": 1.8428969359331475, + "grad_norm": 1.5154982805252075, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8794492483139038, + "num_tokens": 361955240.0, + "step": 9924 + }, + { + "epoch": 1.8430826369545033, + "grad_norm": 1.4633452892303467, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8749383687973022, + "num_tokens": 361996115.0, + "step": 9925 + }, + { + "epoch": 1.843268337975859, + "grad_norm": 1.733411431312561, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8733287453651428, + "num_tokens": 362031996.0, + "step": 9926 + }, + { + "epoch": 1.8434540389972145, + "grad_norm": 1.474995493888855, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8649745583534241, + "num_tokens": 362069539.0, + "step": 9927 + }, + { + "epoch": 1.84363974001857, + "grad_norm": 1.3863866329193115, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.875273585319519, + "num_tokens": 362113030.0, + "step": 9928 + }, + { + "epoch": 1.8438254410399257, + "grad_norm": 1.3556581735610962, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8978660106658936, + "num_tokens": 362154459.0, + "step": 9929 + }, + { + "epoch": 1.8440111420612815, + "grad_norm": 1.5386284589767456, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8830792903900146, + "num_tokens": 362192481.0, + "step": 9930 + }, + { + "epoch": 1.844196843082637, + "grad_norm": 1.5797109603881836, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8751053214073181, + "num_tokens": 362227109.0, + "step": 9931 + }, + { + "epoch": 1.8443825441039925, + "grad_norm": 1.6606895923614502, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8911539316177368, + "num_tokens": 362258378.0, + "step": 9932 + }, + { + "epoch": 1.8445682451253482, + "grad_norm": 1.678308367729187, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8772875070571899, + "num_tokens": 362289261.0, + "step": 9933 + }, + { + "epoch": 1.844753946146704, + "grad_norm": 1.6575496196746826, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8723543882369995, + "num_tokens": 362324545.0, + "step": 9934 + }, + { + "epoch": 1.8449396471680595, + "grad_norm": 1.59573233127594, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8805199265480042, + "num_tokens": 362362377.0, + "step": 9935 + }, + { + "epoch": 1.845125348189415, + "grad_norm": 1.4956985712051392, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8783295154571533, + "num_tokens": 362403673.0, + "step": 9936 + }, + { + "epoch": 1.8453110492107707, + "grad_norm": 1.7068690061569214, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8794203996658325, + "num_tokens": 362436906.0, + "step": 9937 + }, + { + "epoch": 1.8454967502321262, + "grad_norm": 1.6952738761901855, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.885405421257019, + "num_tokens": 362471471.0, + "step": 9938 + }, + { + "epoch": 1.8456824512534817, + "grad_norm": 1.7542469501495361, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8780736923217773, + "num_tokens": 362504815.0, + "step": 9939 + }, + { + "epoch": 1.8458681522748375, + "grad_norm": 1.556793451309204, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8852807283401489, + "num_tokens": 362540252.0, + "step": 9940 + }, + { + "epoch": 1.8460538532961932, + "grad_norm": 1.5806872844696045, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8745879530906677, + "num_tokens": 362574439.0, + "step": 9941 + }, + { + "epoch": 1.8462395543175487, + "grad_norm": 1.5375953912734985, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8930164575576782, + "num_tokens": 362605857.0, + "step": 9942 + }, + { + "epoch": 1.8464252553389042, + "grad_norm": 1.5730534791946411, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8729152679443359, + "num_tokens": 362642400.0, + "step": 9943 + }, + { + "epoch": 1.84661095636026, + "grad_norm": 1.6392438411712646, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8653548955917358, + "num_tokens": 362681819.0, + "step": 9944 + }, + { + "epoch": 1.8467966573816157, + "grad_norm": 1.5086795091629028, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8892009258270264, + "num_tokens": 362720330.0, + "step": 9945 + }, + { + "epoch": 1.8469823584029712, + "grad_norm": 1.6964207887649536, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8678205013275146, + "num_tokens": 362752240.0, + "step": 9946 + }, + { + "epoch": 1.8471680594243267, + "grad_norm": 1.6707385778427124, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8666523694992065, + "num_tokens": 362786618.0, + "step": 9947 + }, + { + "epoch": 1.8473537604456824, + "grad_norm": 1.6698952913284302, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8755651712417603, + "num_tokens": 362822175.0, + "step": 9948 + }, + { + "epoch": 1.8475394614670382, + "grad_norm": 1.503298282623291, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8664673566818237, + "num_tokens": 362859952.0, + "step": 9949 + }, + { + "epoch": 1.8477251624883937, + "grad_norm": 1.6070278882980347, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8835298418998718, + "num_tokens": 362891884.0, + "step": 9950 + }, + { + "epoch": 1.8479108635097492, + "grad_norm": 1.5012348890304565, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8764979839324951, + "num_tokens": 362929377.0, + "step": 9951 + }, + { + "epoch": 1.848096564531105, + "grad_norm": 1.4758330583572388, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8658115863800049, + "num_tokens": 362970871.0, + "step": 9952 + }, + { + "epoch": 1.8482822655524607, + "grad_norm": 1.4612432718276978, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8762574195861816, + "num_tokens": 363006816.0, + "step": 9953 + }, + { + "epoch": 1.8484679665738162, + "grad_norm": 1.487067461013794, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8774365186691284, + "num_tokens": 363042172.0, + "step": 9954 + }, + { + "epoch": 1.8486536675951717, + "grad_norm": 1.4217967987060547, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8814374804496765, + "num_tokens": 363080635.0, + "step": 9955 + }, + { + "epoch": 1.8488393686165274, + "grad_norm": 1.5051331520080566, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8665297031402588, + "num_tokens": 363118669.0, + "step": 9956 + }, + { + "epoch": 1.8490250696378832, + "grad_norm": 1.6059197187423706, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8791353702545166, + "num_tokens": 363150293.0, + "step": 9957 + }, + { + "epoch": 1.8492107706592387, + "grad_norm": 1.484032154083252, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8744163513183594, + "num_tokens": 363190158.0, + "step": 9958 + }, + { + "epoch": 1.8493964716805942, + "grad_norm": 1.5276979207992554, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8726876974105835, + "num_tokens": 363225726.0, + "step": 9959 + }, + { + "epoch": 1.84958217270195, + "grad_norm": 1.562195062637329, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8675855398178101, + "num_tokens": 363264170.0, + "step": 9960 + }, + { + "epoch": 1.8497678737233056, + "grad_norm": 1.4217222929000854, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8746541142463684, + "num_tokens": 363305644.0, + "step": 9961 + }, + { + "epoch": 1.849953574744661, + "grad_norm": 1.6329962015151978, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8725250959396362, + "num_tokens": 363336347.0, + "step": 9962 + }, + { + "epoch": 1.8501392757660167, + "grad_norm": 1.5795552730560303, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8617499470710754, + "num_tokens": 363373461.0, + "step": 9963 + }, + { + "epoch": 1.8503249767873724, + "grad_norm": 1.4449530839920044, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8812063932418823, + "num_tokens": 363415143.0, + "step": 9964 + }, + { + "epoch": 1.850510677808728, + "grad_norm": 1.5586471557617188, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8672608137130737, + "num_tokens": 363452033.0, + "step": 9965 + }, + { + "epoch": 1.8506963788300834, + "grad_norm": 1.5256282091140747, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8789284229278564, + "num_tokens": 363489932.0, + "step": 9966 + }, + { + "epoch": 1.8508820798514392, + "grad_norm": 1.5451641082763672, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8723875284194946, + "num_tokens": 363527258.0, + "step": 9967 + }, + { + "epoch": 1.8510677808727949, + "grad_norm": 1.5393136739730835, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8890092372894287, + "num_tokens": 363561940.0, + "step": 9968 + }, + { + "epoch": 1.8512534818941504, + "grad_norm": 1.4823076725006104, + "learning_rate": 1e-06, + "loss": 0.2737, + "mean_token_accuracy": 0.9027796983718872, + "num_tokens": 363594367.0, + "step": 9969 + }, + { + "epoch": 1.851439182915506, + "grad_norm": 1.5448232889175415, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8717036247253418, + "num_tokens": 363632677.0, + "step": 9970 + }, + { + "epoch": 1.8516248839368616, + "grad_norm": 1.5957820415496826, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8534317016601562, + "num_tokens": 363675528.0, + "step": 9971 + }, + { + "epoch": 1.8518105849582174, + "grad_norm": 1.5542353391647339, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8703969120979309, + "num_tokens": 363710282.0, + "step": 9972 + }, + { + "epoch": 1.8519962859795729, + "grad_norm": 1.5789541006088257, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8688075542449951, + "num_tokens": 363746813.0, + "step": 9973 + }, + { + "epoch": 1.8521819870009284, + "grad_norm": 1.3632997274398804, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.896412193775177, + "num_tokens": 363790008.0, + "step": 9974 + }, + { + "epoch": 1.8523676880222841, + "grad_norm": 1.4700098037719727, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8782488107681274, + "num_tokens": 363827673.0, + "step": 9975 + }, + { + "epoch": 1.8525533890436399, + "grad_norm": 1.5257066488265991, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8850944638252258, + "num_tokens": 363861549.0, + "step": 9976 + }, + { + "epoch": 1.8527390900649954, + "grad_norm": 1.4060271978378296, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8858110308647156, + "num_tokens": 363900275.0, + "step": 9977 + }, + { + "epoch": 1.8529247910863509, + "grad_norm": 1.4857499599456787, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8819271922111511, + "num_tokens": 363939330.0, + "step": 9978 + }, + { + "epoch": 1.8531104921077066, + "grad_norm": 1.4426957368850708, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8848435282707214, + "num_tokens": 363978152.0, + "step": 9979 + }, + { + "epoch": 1.8532961931290624, + "grad_norm": 1.579048752784729, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8714097738265991, + "num_tokens": 364015805.0, + "step": 9980 + }, + { + "epoch": 1.8534818941504179, + "grad_norm": 1.6299622058868408, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8780395984649658, + "num_tokens": 364045937.0, + "step": 9981 + }, + { + "epoch": 1.8536675951717734, + "grad_norm": 1.6238985061645508, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8662813305854797, + "num_tokens": 364080707.0, + "step": 9982 + }, + { + "epoch": 1.853853296193129, + "grad_norm": 1.484691858291626, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8644748330116272, + "num_tokens": 364120933.0, + "step": 9983 + }, + { + "epoch": 1.8540389972144848, + "grad_norm": 1.4627103805541992, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8917837142944336, + "num_tokens": 364163632.0, + "step": 9984 + }, + { + "epoch": 1.8542246982358404, + "grad_norm": 1.693666934967041, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8801702260971069, + "num_tokens": 364193890.0, + "step": 9985 + }, + { + "epoch": 1.8544103992571959, + "grad_norm": 1.5363763570785522, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8734503984451294, + "num_tokens": 364231489.0, + "step": 9986 + }, + { + "epoch": 1.8545961002785516, + "grad_norm": 1.5365288257598877, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8893468379974365, + "num_tokens": 364267842.0, + "step": 9987 + }, + { + "epoch": 1.854781801299907, + "grad_norm": 1.4921046495437622, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8594539165496826, + "num_tokens": 364307807.0, + "step": 9988 + }, + { + "epoch": 1.8549675023212626, + "grad_norm": 1.551958441734314, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8766776919364929, + "num_tokens": 364343948.0, + "step": 9989 + }, + { + "epoch": 1.8551532033426184, + "grad_norm": 1.6695966720581055, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8629833459854126, + "num_tokens": 364379851.0, + "step": 9990 + }, + { + "epoch": 1.855338904363974, + "grad_norm": 1.608124017715454, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8788480758666992, + "num_tokens": 364414453.0, + "step": 9991 + }, + { + "epoch": 1.8555246053853296, + "grad_norm": 1.5562686920166016, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8784571290016174, + "num_tokens": 364449385.0, + "step": 9992 + }, + { + "epoch": 1.855710306406685, + "grad_norm": 1.5013521909713745, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8866146206855774, + "num_tokens": 364488225.0, + "step": 9993 + }, + { + "epoch": 1.8558960074280408, + "grad_norm": 1.7218414545059204, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8929815888404846, + "num_tokens": 364514365.0, + "step": 9994 + }, + { + "epoch": 1.8560817084493966, + "grad_norm": 1.6649062633514404, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.866266667842865, + "num_tokens": 364549741.0, + "step": 9995 + }, + { + "epoch": 1.856267409470752, + "grad_norm": 1.4948704242706299, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8926789164543152, + "num_tokens": 364582688.0, + "step": 9996 + }, + { + "epoch": 1.8564531104921076, + "grad_norm": 1.6014074087142944, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8872987031936646, + "num_tokens": 364616671.0, + "step": 9997 + }, + { + "epoch": 1.8566388115134633, + "grad_norm": 1.6515153646469116, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8710716962814331, + "num_tokens": 364650638.0, + "step": 9998 + }, + { + "epoch": 1.856824512534819, + "grad_norm": 1.6586989164352417, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8747137784957886, + "num_tokens": 364682422.0, + "step": 9999 + }, + { + "epoch": 1.8570102135561746, + "grad_norm": 1.510799527168274, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8672397136688232, + "num_tokens": 364717865.0, + "step": 10000 + }, + { + "epoch": 1.85719591457753, + "grad_norm": 1.5480674505233765, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8827579021453857, + "num_tokens": 364752984.0, + "step": 10001 + }, + { + "epoch": 1.8573816155988858, + "grad_norm": 1.501042127609253, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8672254085540771, + "num_tokens": 364792979.0, + "step": 10002 + }, + { + "epoch": 1.8575673166202415, + "grad_norm": 1.605122447013855, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8797512650489807, + "num_tokens": 364825422.0, + "step": 10003 + }, + { + "epoch": 1.857753017641597, + "grad_norm": 1.4969202280044556, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8657851219177246, + "num_tokens": 364865370.0, + "step": 10004 + }, + { + "epoch": 1.8579387186629526, + "grad_norm": 1.7203266620635986, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.879355251789093, + "num_tokens": 364897282.0, + "step": 10005 + }, + { + "epoch": 1.8581244196843083, + "grad_norm": 1.4675298929214478, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8869122266769409, + "num_tokens": 364934559.0, + "step": 10006 + }, + { + "epoch": 1.858310120705664, + "grad_norm": 1.5608189105987549, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8878769278526306, + "num_tokens": 364967468.0, + "step": 10007 + }, + { + "epoch": 1.8584958217270195, + "grad_norm": 1.6163736581802368, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8655695915222168, + "num_tokens": 365001729.0, + "step": 10008 + }, + { + "epoch": 1.858681522748375, + "grad_norm": 1.5342692136764526, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8822852373123169, + "num_tokens": 365034512.0, + "step": 10009 + }, + { + "epoch": 1.8588672237697308, + "grad_norm": 1.5053294897079468, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.877826452255249, + "num_tokens": 365071981.0, + "step": 10010 + }, + { + "epoch": 1.8590529247910863, + "grad_norm": 1.6034709215164185, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8695454597473145, + "num_tokens": 365106564.0, + "step": 10011 + }, + { + "epoch": 1.8592386258124418, + "grad_norm": 1.4216735363006592, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8793045282363892, + "num_tokens": 365147234.0, + "step": 10012 + }, + { + "epoch": 1.8594243268337975, + "grad_norm": 1.3622040748596191, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8748899698257446, + "num_tokens": 365190164.0, + "step": 10013 + }, + { + "epoch": 1.8596100278551533, + "grad_norm": 1.5040276050567627, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8885729312896729, + "num_tokens": 365223817.0, + "step": 10014 + }, + { + "epoch": 1.8597957288765088, + "grad_norm": 1.4866690635681152, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8834383487701416, + "num_tokens": 365259904.0, + "step": 10015 + }, + { + "epoch": 1.8599814298978643, + "grad_norm": 1.3687303066253662, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8835859894752502, + "num_tokens": 365298610.0, + "step": 10016 + }, + { + "epoch": 1.86016713091922, + "grad_norm": 1.623542070388794, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8724334239959717, + "num_tokens": 365332228.0, + "step": 10017 + }, + { + "epoch": 1.8603528319405758, + "grad_norm": 1.6873559951782227, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8914137482643127, + "num_tokens": 365359365.0, + "step": 10018 + }, + { + "epoch": 1.8605385329619313, + "grad_norm": 1.4031217098236084, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8858290910720825, + "num_tokens": 365399865.0, + "step": 10019 + }, + { + "epoch": 1.8607242339832868, + "grad_norm": 1.5339537858963013, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8836847543716431, + "num_tokens": 365438378.0, + "step": 10020 + }, + { + "epoch": 1.8609099350046425, + "grad_norm": 1.4942073822021484, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8680975437164307, + "num_tokens": 365477296.0, + "step": 10021 + }, + { + "epoch": 1.8610956360259983, + "grad_norm": 1.570199966430664, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.875218391418457, + "num_tokens": 365513509.0, + "step": 10022 + }, + { + "epoch": 1.8612813370473538, + "grad_norm": 1.5039561986923218, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8794862031936646, + "num_tokens": 365550453.0, + "step": 10023 + }, + { + "epoch": 1.8614670380687093, + "grad_norm": 1.727934718132019, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8746997117996216, + "num_tokens": 365583463.0, + "step": 10024 + }, + { + "epoch": 1.861652739090065, + "grad_norm": 1.539419412612915, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8780753016471863, + "num_tokens": 365618099.0, + "step": 10025 + }, + { + "epoch": 1.8618384401114207, + "grad_norm": 1.4273550510406494, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8805794715881348, + "num_tokens": 365658717.0, + "step": 10026 + }, + { + "epoch": 1.8620241411327763, + "grad_norm": 1.4638088941574097, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8943984508514404, + "num_tokens": 365693224.0, + "step": 10027 + }, + { + "epoch": 1.8622098421541318, + "grad_norm": 1.4783692359924316, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8815509676933289, + "num_tokens": 365730441.0, + "step": 10028 + }, + { + "epoch": 1.8623955431754875, + "grad_norm": 1.658089280128479, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8653749227523804, + "num_tokens": 365764300.0, + "step": 10029 + }, + { + "epoch": 1.8625812441968432, + "grad_norm": 1.4565424919128418, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8647501468658447, + "num_tokens": 365803354.0, + "step": 10030 + }, + { + "epoch": 1.8627669452181987, + "grad_norm": 1.8525850772857666, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8726310729980469, + "num_tokens": 365837791.0, + "step": 10031 + }, + { + "epoch": 1.8629526462395543, + "grad_norm": 1.5256224870681763, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8923507928848267, + "num_tokens": 365870124.0, + "step": 10032 + }, + { + "epoch": 1.86313834726091, + "grad_norm": 1.47820246219635, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.8962507247924805, + "num_tokens": 365902778.0, + "step": 10033 + }, + { + "epoch": 1.8633240482822655, + "grad_norm": 1.4360865354537964, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8854000568389893, + "num_tokens": 365941684.0, + "step": 10034 + }, + { + "epoch": 1.863509749303621, + "grad_norm": 1.548535704612732, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8788161873817444, + "num_tokens": 365976799.0, + "step": 10035 + }, + { + "epoch": 1.8636954503249767, + "grad_norm": 1.499686360359192, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8784331679344177, + "num_tokens": 366017555.0, + "step": 10036 + }, + { + "epoch": 1.8638811513463325, + "grad_norm": 1.5457154512405396, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8863471746444702, + "num_tokens": 366053628.0, + "step": 10037 + }, + { + "epoch": 1.864066852367688, + "grad_norm": 1.583512783050537, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8738425970077515, + "num_tokens": 366088526.0, + "step": 10038 + }, + { + "epoch": 1.8642525533890435, + "grad_norm": 1.443572998046875, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8773356080055237, + "num_tokens": 366129419.0, + "step": 10039 + }, + { + "epoch": 1.8644382544103992, + "grad_norm": 1.501839518547058, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8894661068916321, + "num_tokens": 366164991.0, + "step": 10040 + }, + { + "epoch": 1.864623955431755, + "grad_norm": 1.4723421335220337, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8853756785392761, + "num_tokens": 366202229.0, + "step": 10041 + }, + { + "epoch": 1.8648096564531105, + "grad_norm": 1.4401047229766846, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8780525922775269, + "num_tokens": 366241443.0, + "step": 10042 + }, + { + "epoch": 1.864995357474466, + "grad_norm": 1.5300629138946533, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8778069615364075, + "num_tokens": 366276307.0, + "step": 10043 + }, + { + "epoch": 1.8651810584958217, + "grad_norm": 1.5197674036026, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8686726093292236, + "num_tokens": 366313635.0, + "step": 10044 + }, + { + "epoch": 1.8653667595171775, + "grad_norm": 1.5441169738769531, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8901739716529846, + "num_tokens": 366347620.0, + "step": 10045 + }, + { + "epoch": 1.865552460538533, + "grad_norm": 1.5394079685211182, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8886409997940063, + "num_tokens": 366380613.0, + "step": 10046 + }, + { + "epoch": 1.8657381615598885, + "grad_norm": 1.5942639112472534, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8789768218994141, + "num_tokens": 366412520.0, + "step": 10047 + }, + { + "epoch": 1.8659238625812442, + "grad_norm": 1.5052874088287354, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8860231637954712, + "num_tokens": 366445887.0, + "step": 10048 + }, + { + "epoch": 1.8661095636026, + "grad_norm": 1.6164448261260986, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8737984299659729, + "num_tokens": 366480610.0, + "step": 10049 + }, + { + "epoch": 1.8662952646239555, + "grad_norm": 1.664119005203247, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8807512521743774, + "num_tokens": 366510925.0, + "step": 10050 + }, + { + "epoch": 1.866480965645311, + "grad_norm": 1.604073405265808, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8697230815887451, + "num_tokens": 366542784.0, + "step": 10051 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 1.5200179815292358, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8906456828117371, + "num_tokens": 366573617.0, + "step": 10052 + }, + { + "epoch": 1.8668523676880224, + "grad_norm": 1.5411728620529175, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8672882318496704, + "num_tokens": 366613468.0, + "step": 10053 + }, + { + "epoch": 1.867038068709378, + "grad_norm": 1.5179940462112427, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8843599557876587, + "num_tokens": 366649703.0, + "step": 10054 + }, + { + "epoch": 1.8672237697307335, + "grad_norm": 1.915067434310913, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8747679591178894, + "num_tokens": 366677125.0, + "step": 10055 + }, + { + "epoch": 1.8674094707520892, + "grad_norm": 1.7541707754135132, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8679115772247314, + "num_tokens": 366712852.0, + "step": 10056 + }, + { + "epoch": 1.867595171773445, + "grad_norm": 1.5270336866378784, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8651517629623413, + "num_tokens": 366755237.0, + "step": 10057 + }, + { + "epoch": 1.8677808727948002, + "grad_norm": 1.5810902118682861, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8808603882789612, + "num_tokens": 366791148.0, + "step": 10058 + }, + { + "epoch": 1.867966573816156, + "grad_norm": 1.6065808534622192, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8635966777801514, + "num_tokens": 366829961.0, + "step": 10059 + }, + { + "epoch": 1.8681522748375117, + "grad_norm": 1.438503384590149, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8813732862472534, + "num_tokens": 366869881.0, + "step": 10060 + }, + { + "epoch": 1.8683379758588672, + "grad_norm": 1.5920151472091675, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8667519688606262, + "num_tokens": 366904777.0, + "step": 10061 + }, + { + "epoch": 1.8685236768802227, + "grad_norm": 1.479077935218811, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8757019639015198, + "num_tokens": 366943595.0, + "step": 10062 + }, + { + "epoch": 1.8687093779015784, + "grad_norm": 1.56047785282135, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8740415573120117, + "num_tokens": 366977774.0, + "step": 10063 + }, + { + "epoch": 1.8688950789229342, + "grad_norm": 1.5260363817214966, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8751176595687866, + "num_tokens": 367016352.0, + "step": 10064 + }, + { + "epoch": 1.8690807799442897, + "grad_norm": 1.5237257480621338, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8813579082489014, + "num_tokens": 367052565.0, + "step": 10065 + }, + { + "epoch": 1.8692664809656452, + "grad_norm": 1.614115834236145, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.889549970626831, + "num_tokens": 367083808.0, + "step": 10066 + }, + { + "epoch": 1.869452181987001, + "grad_norm": 1.409075379371643, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.8954817056655884, + "num_tokens": 367120042.0, + "step": 10067 + }, + { + "epoch": 1.8696378830083566, + "grad_norm": 1.6509029865264893, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8847813606262207, + "num_tokens": 367150762.0, + "step": 10068 + }, + { + "epoch": 1.8698235840297122, + "grad_norm": 1.4280661344528198, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8807114362716675, + "num_tokens": 367194662.0, + "step": 10069 + }, + { + "epoch": 1.8700092850510677, + "grad_norm": 1.5645378828048706, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8916317224502563, + "num_tokens": 367227518.0, + "step": 10070 + }, + { + "epoch": 1.8701949860724234, + "grad_norm": 1.6031697988510132, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8820709586143494, + "num_tokens": 367266827.0, + "step": 10071 + }, + { + "epoch": 1.8703806870937791, + "grad_norm": 1.484238862991333, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8862271308898926, + "num_tokens": 367304137.0, + "step": 10072 + }, + { + "epoch": 1.8705663881151346, + "grad_norm": 1.8630211353302002, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8719377517700195, + "num_tokens": 367332672.0, + "step": 10073 + }, + { + "epoch": 1.8707520891364902, + "grad_norm": 1.5761666297912598, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8823182582855225, + "num_tokens": 367368678.0, + "step": 10074 + }, + { + "epoch": 1.870937790157846, + "grad_norm": 1.565821647644043, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8772813081741333, + "num_tokens": 367405175.0, + "step": 10075 + }, + { + "epoch": 1.8711234911792016, + "grad_norm": 1.4564377069473267, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8823122382164001, + "num_tokens": 367445385.0, + "step": 10076 + }, + { + "epoch": 1.8713091922005571, + "grad_norm": 1.6061973571777344, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8625251650810242, + "num_tokens": 367480194.0, + "step": 10077 + }, + { + "epoch": 1.8714948932219126, + "grad_norm": 1.5466629266738892, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8754005432128906, + "num_tokens": 367517787.0, + "step": 10078 + }, + { + "epoch": 1.8716805942432684, + "grad_norm": 1.527410864830017, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.88484126329422, + "num_tokens": 367554222.0, + "step": 10079 + }, + { + "epoch": 1.8718662952646241, + "grad_norm": 1.5411583185195923, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8712850213050842, + "num_tokens": 367595054.0, + "step": 10080 + }, + { + "epoch": 1.8720519962859796, + "grad_norm": 1.3975192308425903, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8903348445892334, + "num_tokens": 367634499.0, + "step": 10081 + }, + { + "epoch": 1.8722376973073351, + "grad_norm": 1.406316876411438, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8936402201652527, + "num_tokens": 367672833.0, + "step": 10082 + }, + { + "epoch": 1.8724233983286909, + "grad_norm": 1.6662325859069824, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8723204731941223, + "num_tokens": 367703759.0, + "step": 10083 + }, + { + "epoch": 1.8726090993500464, + "grad_norm": 1.5040998458862305, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8909735679626465, + "num_tokens": 367738747.0, + "step": 10084 + }, + { + "epoch": 1.872794800371402, + "grad_norm": 1.6209793090820312, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8924945592880249, + "num_tokens": 367768042.0, + "step": 10085 + }, + { + "epoch": 1.8729805013927576, + "grad_norm": 1.35050630569458, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8806201219558716, + "num_tokens": 367811156.0, + "step": 10086 + }, + { + "epoch": 1.8731662024141134, + "grad_norm": 1.5022271871566772, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8793960809707642, + "num_tokens": 367845890.0, + "step": 10087 + }, + { + "epoch": 1.8733519034354689, + "grad_norm": 1.4876501560211182, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8791859149932861, + "num_tokens": 367885287.0, + "step": 10088 + }, + { + "epoch": 1.8735376044568244, + "grad_norm": 1.6139923334121704, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8775047063827515, + "num_tokens": 367919908.0, + "step": 10089 + }, + { + "epoch": 1.8737233054781801, + "grad_norm": 1.6472707986831665, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8736006021499634, + "num_tokens": 367951908.0, + "step": 10090 + }, + { + "epoch": 1.8739090064995358, + "grad_norm": 1.5300352573394775, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8830949068069458, + "num_tokens": 367985414.0, + "step": 10091 + }, + { + "epoch": 1.8740947075208914, + "grad_norm": 1.3918288946151733, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8890467286109924, + "num_tokens": 368024228.0, + "step": 10092 + }, + { + "epoch": 1.8742804085422469, + "grad_norm": 1.4908753633499146, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8896896839141846, + "num_tokens": 368060342.0, + "step": 10093 + }, + { + "epoch": 1.8744661095636026, + "grad_norm": 1.385848045349121, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8853956460952759, + "num_tokens": 368101814.0, + "step": 10094 + }, + { + "epoch": 1.8746518105849583, + "grad_norm": 1.4006565809249878, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8754919767379761, + "num_tokens": 368141038.0, + "step": 10095 + }, + { + "epoch": 1.8748375116063138, + "grad_norm": 1.5621930360794067, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8739792108535767, + "num_tokens": 368174677.0, + "step": 10096 + }, + { + "epoch": 1.8750232126276694, + "grad_norm": 1.425824761390686, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8855127096176147, + "num_tokens": 368214112.0, + "step": 10097 + }, + { + "epoch": 1.875208913649025, + "grad_norm": 1.6374540328979492, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.877387285232544, + "num_tokens": 368247792.0, + "step": 10098 + }, + { + "epoch": 1.8753946146703808, + "grad_norm": 1.5045229196548462, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8863235116004944, + "num_tokens": 368284237.0, + "step": 10099 + }, + { + "epoch": 1.8755803156917363, + "grad_norm": 1.5286473035812378, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8791573643684387, + "num_tokens": 368318185.0, + "step": 10100 + }, + { + "epoch": 1.8757660167130918, + "grad_norm": 1.6528493165969849, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8883719444274902, + "num_tokens": 368348228.0, + "step": 10101 + }, + { + "epoch": 1.8759517177344476, + "grad_norm": 1.5807839632034302, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8901634812355042, + "num_tokens": 368384524.0, + "step": 10102 + }, + { + "epoch": 1.8761374187558033, + "grad_norm": 1.5732080936431885, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8821067810058594, + "num_tokens": 368419380.0, + "step": 10103 + }, + { + "epoch": 1.8763231197771588, + "grad_norm": 1.4618271589279175, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8619514107704163, + "num_tokens": 368462844.0, + "step": 10104 + }, + { + "epoch": 1.8765088207985143, + "grad_norm": 1.5667320489883423, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.862910270690918, + "num_tokens": 368501673.0, + "step": 10105 + }, + { + "epoch": 1.87669452181987, + "grad_norm": 1.6342016458511353, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8809321522712708, + "num_tokens": 368534751.0, + "step": 10106 + }, + { + "epoch": 1.8768802228412256, + "grad_norm": 1.5099722146987915, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8765956163406372, + "num_tokens": 368572661.0, + "step": 10107 + }, + { + "epoch": 1.877065923862581, + "grad_norm": 1.5351520776748657, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8636630177497864, + "num_tokens": 368610296.0, + "step": 10108 + }, + { + "epoch": 1.8772516248839368, + "grad_norm": 1.5544825792312622, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8688533306121826, + "num_tokens": 368646194.0, + "step": 10109 + }, + { + "epoch": 1.8774373259052926, + "grad_norm": 1.4378869533538818, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8910114169120789, + "num_tokens": 368680749.0, + "step": 10110 + }, + { + "epoch": 1.877623026926648, + "grad_norm": 1.4922680854797363, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8895151615142822, + "num_tokens": 368715040.0, + "step": 10111 + }, + { + "epoch": 1.8778087279480036, + "grad_norm": 1.507973074913025, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8809173107147217, + "num_tokens": 368753933.0, + "step": 10112 + }, + { + "epoch": 1.8779944289693593, + "grad_norm": 1.5372892618179321, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8873871564865112, + "num_tokens": 368790213.0, + "step": 10113 + }, + { + "epoch": 1.878180129990715, + "grad_norm": 1.4696053266525269, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.879715085029602, + "num_tokens": 368827505.0, + "step": 10114 + }, + { + "epoch": 1.8783658310120706, + "grad_norm": 1.3822076320648193, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8747532963752747, + "num_tokens": 368869745.0, + "step": 10115 + }, + { + "epoch": 1.878551532033426, + "grad_norm": 1.5747323036193848, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8767948746681213, + "num_tokens": 368909541.0, + "step": 10116 + }, + { + "epoch": 1.8787372330547818, + "grad_norm": 1.7475076913833618, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8762289881706238, + "num_tokens": 368939180.0, + "step": 10117 + }, + { + "epoch": 1.8789229340761375, + "grad_norm": 1.4081544876098633, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8862379193305969, + "num_tokens": 368980967.0, + "step": 10118 + }, + { + "epoch": 1.879108635097493, + "grad_norm": 1.6570154428482056, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8833247423171997, + "num_tokens": 369013431.0, + "step": 10119 + }, + { + "epoch": 1.8792943361188486, + "grad_norm": 1.5856842994689941, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8791743516921997, + "num_tokens": 369048445.0, + "step": 10120 + }, + { + "epoch": 1.8794800371402043, + "grad_norm": 1.5986049175262451, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8791204690933228, + "num_tokens": 369083421.0, + "step": 10121 + }, + { + "epoch": 1.87966573816156, + "grad_norm": 1.5762109756469727, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8654664754867554, + "num_tokens": 369117672.0, + "step": 10122 + }, + { + "epoch": 1.8798514391829155, + "grad_norm": 1.5514020919799805, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8834914565086365, + "num_tokens": 369153861.0, + "step": 10123 + }, + { + "epoch": 1.880037140204271, + "grad_norm": 1.393804907798767, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8803977966308594, + "num_tokens": 369194843.0, + "step": 10124 + }, + { + "epoch": 1.8802228412256268, + "grad_norm": 1.5015733242034912, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.875436007976532, + "num_tokens": 369231272.0, + "step": 10125 + }, + { + "epoch": 1.8804085422469825, + "grad_norm": 1.6256303787231445, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8833264112472534, + "num_tokens": 369263397.0, + "step": 10126 + }, + { + "epoch": 1.880594243268338, + "grad_norm": 1.5475480556488037, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8788648247718811, + "num_tokens": 369300605.0, + "step": 10127 + }, + { + "epoch": 1.8807799442896935, + "grad_norm": 1.6083279848098755, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8759742975234985, + "num_tokens": 369332015.0, + "step": 10128 + }, + { + "epoch": 1.8809656453110493, + "grad_norm": 1.4924989938735962, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8837769627571106, + "num_tokens": 369373316.0, + "step": 10129 + }, + { + "epoch": 1.881151346332405, + "grad_norm": 1.6446857452392578, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8743820190429688, + "num_tokens": 369408137.0, + "step": 10130 + }, + { + "epoch": 1.8813370473537603, + "grad_norm": 1.4788788557052612, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8918972611427307, + "num_tokens": 369441365.0, + "step": 10131 + }, + { + "epoch": 1.881522748375116, + "grad_norm": 1.6546889543533325, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8751912713050842, + "num_tokens": 369472874.0, + "step": 10132 + }, + { + "epoch": 1.8817084493964717, + "grad_norm": 1.4791463613510132, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8773800730705261, + "num_tokens": 369508281.0, + "step": 10133 + }, + { + "epoch": 1.8818941504178273, + "grad_norm": 1.5483795404434204, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8739331960678101, + "num_tokens": 369542003.0, + "step": 10134 + }, + { + "epoch": 1.8820798514391828, + "grad_norm": 1.4273529052734375, + "learning_rate": 1e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.8994476199150085, + "num_tokens": 369575708.0, + "step": 10135 + }, + { + "epoch": 1.8822655524605385, + "grad_norm": 1.5783884525299072, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8762835264205933, + "num_tokens": 369611100.0, + "step": 10136 + }, + { + "epoch": 1.8824512534818942, + "grad_norm": 1.564117193222046, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.8985605239868164, + "num_tokens": 369644338.0, + "step": 10137 + }, + { + "epoch": 1.8826369545032497, + "grad_norm": 1.4876210689544678, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8688777089118958, + "num_tokens": 369682918.0, + "step": 10138 + }, + { + "epoch": 1.8828226555246053, + "grad_norm": 1.613455057144165, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.884791910648346, + "num_tokens": 369715957.0, + "step": 10139 + }, + { + "epoch": 1.883008356545961, + "grad_norm": 1.4675285816192627, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8911114931106567, + "num_tokens": 369757641.0, + "step": 10140 + }, + { + "epoch": 1.8831940575673167, + "grad_norm": 1.7335174083709717, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8813678622245789, + "num_tokens": 369788080.0, + "step": 10141 + }, + { + "epoch": 1.8833797585886722, + "grad_norm": 1.51663076877594, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8768219947814941, + "num_tokens": 369824331.0, + "step": 10142 + }, + { + "epoch": 1.8835654596100277, + "grad_norm": 1.5269478559494019, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8900189399719238, + "num_tokens": 369859043.0, + "step": 10143 + }, + { + "epoch": 1.8837511606313835, + "grad_norm": 1.5198885202407837, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8903019428253174, + "num_tokens": 369893328.0, + "step": 10144 + }, + { + "epoch": 1.8839368616527392, + "grad_norm": 1.5606083869934082, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8662745952606201, + "num_tokens": 369933109.0, + "step": 10145 + }, + { + "epoch": 1.8841225626740947, + "grad_norm": 1.5475237369537354, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8858606815338135, + "num_tokens": 369965780.0, + "step": 10146 + }, + { + "epoch": 1.8843082636954502, + "grad_norm": 1.5426348447799683, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8798931241035461, + "num_tokens": 370001570.0, + "step": 10147 + }, + { + "epoch": 1.884493964716806, + "grad_norm": 1.458251714706421, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8734972476959229, + "num_tokens": 370042011.0, + "step": 10148 + }, + { + "epoch": 1.8846796657381617, + "grad_norm": 1.6534367799758911, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.877643346786499, + "num_tokens": 370074417.0, + "step": 10149 + }, + { + "epoch": 1.8848653667595172, + "grad_norm": 1.4823949337005615, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8836784362792969, + "num_tokens": 370115262.0, + "step": 10150 + }, + { + "epoch": 1.8850510677808727, + "grad_norm": 1.5305331945419312, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.883868396282196, + "num_tokens": 370152413.0, + "step": 10151 + }, + { + "epoch": 1.8852367688022285, + "grad_norm": 1.4576454162597656, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8712394833564758, + "num_tokens": 370194372.0, + "step": 10152 + }, + { + "epoch": 1.8854224698235842, + "grad_norm": 1.660703420639038, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.891600489616394, + "num_tokens": 370224788.0, + "step": 10153 + }, + { + "epoch": 1.8856081708449397, + "grad_norm": 1.5200189352035522, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8779830932617188, + "num_tokens": 370265588.0, + "step": 10154 + }, + { + "epoch": 1.8857938718662952, + "grad_norm": 1.7154675722122192, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8753659725189209, + "num_tokens": 370296763.0, + "step": 10155 + }, + { + "epoch": 1.885979572887651, + "grad_norm": 1.470892071723938, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8650990724563599, + "num_tokens": 370340995.0, + "step": 10156 + }, + { + "epoch": 1.8861652739090065, + "grad_norm": 1.444000005722046, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8876550197601318, + "num_tokens": 370376460.0, + "step": 10157 + }, + { + "epoch": 1.886350974930362, + "grad_norm": 1.493772268295288, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8687576055526733, + "num_tokens": 370419130.0, + "step": 10158 + }, + { + "epoch": 1.8865366759517177, + "grad_norm": 1.4900448322296143, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8603228330612183, + "num_tokens": 370459744.0, + "step": 10159 + }, + { + "epoch": 1.8867223769730734, + "grad_norm": 1.4769351482391357, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8917603492736816, + "num_tokens": 370494306.0, + "step": 10160 + }, + { + "epoch": 1.886908077994429, + "grad_norm": 1.5254154205322266, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8897548913955688, + "num_tokens": 370527545.0, + "step": 10161 + }, + { + "epoch": 1.8870937790157845, + "grad_norm": 1.4915927648544312, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.887746274471283, + "num_tokens": 370563994.0, + "step": 10162 + }, + { + "epoch": 1.8872794800371402, + "grad_norm": 1.679118037223816, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8736388683319092, + "num_tokens": 370595008.0, + "step": 10163 + }, + { + "epoch": 1.887465181058496, + "grad_norm": 1.5376849174499512, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8785534501075745, + "num_tokens": 370629538.0, + "step": 10164 + }, + { + "epoch": 1.8876508820798514, + "grad_norm": 1.4979356527328491, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8851784467697144, + "num_tokens": 370665978.0, + "step": 10165 + }, + { + "epoch": 1.887836583101207, + "grad_norm": 1.5663955211639404, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8768179416656494, + "num_tokens": 370701007.0, + "step": 10166 + }, + { + "epoch": 1.8880222841225627, + "grad_norm": 1.4425573348999023, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8886010646820068, + "num_tokens": 370737194.0, + "step": 10167 + }, + { + "epoch": 1.8882079851439184, + "grad_norm": 1.556548833847046, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8855724930763245, + "num_tokens": 370772490.0, + "step": 10168 + }, + { + "epoch": 1.888393686165274, + "grad_norm": 1.5947149991989136, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8686459064483643, + "num_tokens": 370809302.0, + "step": 10169 + }, + { + "epoch": 1.8885793871866294, + "grad_norm": 1.5639739036560059, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.875298261642456, + "num_tokens": 370842661.0, + "step": 10170 + }, + { + "epoch": 1.8887650882079852, + "grad_norm": 1.6425927877426147, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8719048500061035, + "num_tokens": 370878360.0, + "step": 10171 + }, + { + "epoch": 1.888950789229341, + "grad_norm": 1.5478993654251099, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.887215793132782, + "num_tokens": 370914349.0, + "step": 10172 + }, + { + "epoch": 1.8891364902506964, + "grad_norm": 1.5580201148986816, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8889030814170837, + "num_tokens": 370947079.0, + "step": 10173 + }, + { + "epoch": 1.889322191272052, + "grad_norm": 1.4350472688674927, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8718761205673218, + "num_tokens": 370989647.0, + "step": 10174 + }, + { + "epoch": 1.8895078922934077, + "grad_norm": 1.483454704284668, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8850156664848328, + "num_tokens": 371030956.0, + "step": 10175 + }, + { + "epoch": 1.8896935933147634, + "grad_norm": 1.514605164527893, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8760159611701965, + "num_tokens": 371068033.0, + "step": 10176 + }, + { + "epoch": 1.889879294336119, + "grad_norm": 1.424652099609375, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.880937933921814, + "num_tokens": 371105748.0, + "step": 10177 + }, + { + "epoch": 1.8900649953574744, + "grad_norm": 1.4886157512664795, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8758484125137329, + "num_tokens": 371143927.0, + "step": 10178 + }, + { + "epoch": 1.8902506963788301, + "grad_norm": 1.4535295963287354, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8827768564224243, + "num_tokens": 371180708.0, + "step": 10179 + }, + { + "epoch": 1.8904363974001857, + "grad_norm": 1.4515265226364136, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8864884972572327, + "num_tokens": 371218511.0, + "step": 10180 + }, + { + "epoch": 1.8906220984215412, + "grad_norm": 1.5926486253738403, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8710159063339233, + "num_tokens": 371254867.0, + "step": 10181 + }, + { + "epoch": 1.890807799442897, + "grad_norm": 1.473523736000061, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8758279085159302, + "num_tokens": 371294451.0, + "step": 10182 + }, + { + "epoch": 1.8909935004642526, + "grad_norm": 1.5054644346237183, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8831318616867065, + "num_tokens": 371330854.0, + "step": 10183 + }, + { + "epoch": 1.8911792014856081, + "grad_norm": 1.371382236480713, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8924156427383423, + "num_tokens": 371371869.0, + "step": 10184 + }, + { + "epoch": 1.8913649025069637, + "grad_norm": 1.3972870111465454, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.887136697769165, + "num_tokens": 371411465.0, + "step": 10185 + }, + { + "epoch": 1.8915506035283194, + "grad_norm": 1.7767637968063354, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8779820203781128, + "num_tokens": 371441733.0, + "step": 10186 + }, + { + "epoch": 1.8917363045496751, + "grad_norm": 1.6723997592926025, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8761463165283203, + "num_tokens": 371475087.0, + "step": 10187 + }, + { + "epoch": 1.8919220055710306, + "grad_norm": 1.555069923400879, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8716539144515991, + "num_tokens": 371513792.0, + "step": 10188 + }, + { + "epoch": 1.8921077065923861, + "grad_norm": 1.5723856687545776, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8831965327262878, + "num_tokens": 371547581.0, + "step": 10189 + }, + { + "epoch": 1.8922934076137419, + "grad_norm": 1.5349130630493164, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8838568925857544, + "num_tokens": 371581872.0, + "step": 10190 + }, + { + "epoch": 1.8924791086350976, + "grad_norm": 1.4957847595214844, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8914965987205505, + "num_tokens": 371618529.0, + "step": 10191 + }, + { + "epoch": 1.8926648096564531, + "grad_norm": 1.6893606185913086, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8794803619384766, + "num_tokens": 371649611.0, + "step": 10192 + }, + { + "epoch": 1.8928505106778086, + "grad_norm": 1.6923696994781494, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8958438634872437, + "num_tokens": 371683001.0, + "step": 10193 + }, + { + "epoch": 1.8930362116991644, + "grad_norm": 1.6827608346939087, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8756199479103088, + "num_tokens": 371716206.0, + "step": 10194 + }, + { + "epoch": 1.89322191272052, + "grad_norm": 1.5791151523590088, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8750882148742676, + "num_tokens": 371751026.0, + "step": 10195 + }, + { + "epoch": 1.8934076137418756, + "grad_norm": 1.558119297027588, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8772307634353638, + "num_tokens": 371785201.0, + "step": 10196 + }, + { + "epoch": 1.8935933147632311, + "grad_norm": 1.3177311420440674, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8913626670837402, + "num_tokens": 371830470.0, + "step": 10197 + }, + { + "epoch": 1.8937790157845868, + "grad_norm": 1.5798046588897705, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8606597185134888, + "num_tokens": 371869728.0, + "step": 10198 + }, + { + "epoch": 1.8939647168059426, + "grad_norm": 1.5749282836914062, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8814476728439331, + "num_tokens": 371905681.0, + "step": 10199 + }, + { + "epoch": 1.894150417827298, + "grad_norm": 1.5847915410995483, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8734248876571655, + "num_tokens": 371943315.0, + "step": 10200 + }, + { + "epoch": 1.8943361188486536, + "grad_norm": 1.4844565391540527, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8886904716491699, + "num_tokens": 371979726.0, + "step": 10201 + }, + { + "epoch": 1.8945218198700093, + "grad_norm": 1.4122116565704346, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8778088092803955, + "num_tokens": 372020315.0, + "step": 10202 + }, + { + "epoch": 1.8947075208913648, + "grad_norm": 1.5747870206832886, + "learning_rate": 1e-06, + "loss": 0.2828, + "mean_token_accuracy": 0.8961006999015808, + "num_tokens": 372052058.0, + "step": 10203 + }, + { + "epoch": 1.8948932219127204, + "grad_norm": 1.5794291496276855, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8843072652816772, + "num_tokens": 372090124.0, + "step": 10204 + }, + { + "epoch": 1.895078922934076, + "grad_norm": 1.5114997625350952, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8906741738319397, + "num_tokens": 372123199.0, + "step": 10205 + }, + { + "epoch": 1.8952646239554318, + "grad_norm": 1.6268424987792969, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8798737525939941, + "num_tokens": 372155993.0, + "step": 10206 + }, + { + "epoch": 1.8954503249767873, + "grad_norm": 1.4694523811340332, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8832062482833862, + "num_tokens": 372191003.0, + "step": 10207 + }, + { + "epoch": 1.8956360259981428, + "grad_norm": 1.5127902030944824, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8793260455131531, + "num_tokens": 372223806.0, + "step": 10208 + }, + { + "epoch": 1.8958217270194986, + "grad_norm": 1.6005228757858276, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8829978108406067, + "num_tokens": 372255366.0, + "step": 10209 + }, + { + "epoch": 1.8960074280408543, + "grad_norm": 1.483512282371521, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8875142335891724, + "num_tokens": 372290783.0, + "step": 10210 + }, + { + "epoch": 1.8961931290622098, + "grad_norm": 1.7065974473953247, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8845610618591309, + "num_tokens": 372318973.0, + "step": 10211 + }, + { + "epoch": 1.8963788300835653, + "grad_norm": 1.6722986698150635, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8787147998809814, + "num_tokens": 372348790.0, + "step": 10212 + }, + { + "epoch": 1.896564531104921, + "grad_norm": 1.5306626558303833, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8733231425285339, + "num_tokens": 372390063.0, + "step": 10213 + }, + { + "epoch": 1.8967502321262768, + "grad_norm": 1.4088410139083862, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8854256868362427, + "num_tokens": 372430235.0, + "step": 10214 + }, + { + "epoch": 1.8969359331476323, + "grad_norm": 1.7885727882385254, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8574628233909607, + "num_tokens": 372464692.0, + "step": 10215 + }, + { + "epoch": 1.8971216341689878, + "grad_norm": 1.5361504554748535, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8944519758224487, + "num_tokens": 372496315.0, + "step": 10216 + }, + { + "epoch": 1.8973073351903436, + "grad_norm": 1.555558204650879, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8827766180038452, + "num_tokens": 372530873.0, + "step": 10217 + }, + { + "epoch": 1.8974930362116993, + "grad_norm": 1.6394798755645752, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8757855892181396, + "num_tokens": 372565486.0, + "step": 10218 + }, + { + "epoch": 1.8976787372330548, + "grad_norm": 1.6131854057312012, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.887789249420166, + "num_tokens": 372601620.0, + "step": 10219 + }, + { + "epoch": 1.8978644382544103, + "grad_norm": 1.5711921453475952, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8729751110076904, + "num_tokens": 372639590.0, + "step": 10220 + }, + { + "epoch": 1.898050139275766, + "grad_norm": 1.5577532052993774, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8728792071342468, + "num_tokens": 372674821.0, + "step": 10221 + }, + { + "epoch": 1.8982358402971218, + "grad_norm": 1.5137090682983398, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8768928647041321, + "num_tokens": 372712056.0, + "step": 10222 + }, + { + "epoch": 1.8984215413184773, + "grad_norm": 1.593231439590454, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8753214478492737, + "num_tokens": 372744337.0, + "step": 10223 + }, + { + "epoch": 1.8986072423398328, + "grad_norm": 1.541730284690857, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8765681982040405, + "num_tokens": 372781842.0, + "step": 10224 + }, + { + "epoch": 1.8987929433611885, + "grad_norm": 1.5154621601104736, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8874621391296387, + "num_tokens": 372815592.0, + "step": 10225 + }, + { + "epoch": 1.8989786443825443, + "grad_norm": 1.5950967073440552, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.879666805267334, + "num_tokens": 372849293.0, + "step": 10226 + }, + { + "epoch": 1.8991643454038996, + "grad_norm": 1.3664138317108154, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.891457736492157, + "num_tokens": 372889879.0, + "step": 10227 + }, + { + "epoch": 1.8993500464252553, + "grad_norm": 1.5482268333435059, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8815653920173645, + "num_tokens": 372920889.0, + "step": 10228 + }, + { + "epoch": 1.899535747446611, + "grad_norm": 1.514342188835144, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8795042634010315, + "num_tokens": 372958214.0, + "step": 10229 + }, + { + "epoch": 1.8997214484679665, + "grad_norm": 1.596604347229004, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8831013441085815, + "num_tokens": 372992334.0, + "step": 10230 + }, + { + "epoch": 1.899907149489322, + "grad_norm": 1.5561796426773071, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8680689930915833, + "num_tokens": 373029050.0, + "step": 10231 + }, + { + "epoch": 1.9000928505106778, + "grad_norm": 1.6045143604278564, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8757402896881104, + "num_tokens": 373064691.0, + "step": 10232 + }, + { + "epoch": 1.9002785515320335, + "grad_norm": 1.5189188718795776, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.88426673412323, + "num_tokens": 373100143.0, + "step": 10233 + }, + { + "epoch": 1.900464252553389, + "grad_norm": 1.3817331790924072, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8864748477935791, + "num_tokens": 373140616.0, + "step": 10234 + }, + { + "epoch": 1.9006499535747445, + "grad_norm": 1.634389042854309, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8833719491958618, + "num_tokens": 373170005.0, + "step": 10235 + }, + { + "epoch": 1.9008356545961003, + "grad_norm": 1.6490497589111328, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8860509395599365, + "num_tokens": 373203007.0, + "step": 10236 + }, + { + "epoch": 1.901021355617456, + "grad_norm": 1.3902684450149536, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8930607438087463, + "num_tokens": 373242672.0, + "step": 10237 + }, + { + "epoch": 1.9012070566388115, + "grad_norm": 1.468309760093689, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8830945491790771, + "num_tokens": 373279374.0, + "step": 10238 + }, + { + "epoch": 1.901392757660167, + "grad_norm": 1.4202954769134521, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8817751407623291, + "num_tokens": 373318017.0, + "step": 10239 + }, + { + "epoch": 1.9015784586815228, + "grad_norm": 1.4342477321624756, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8652251958847046, + "num_tokens": 373364904.0, + "step": 10240 + }, + { + "epoch": 1.9017641597028785, + "grad_norm": 1.5248042345046997, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8736617565155029, + "num_tokens": 373403878.0, + "step": 10241 + }, + { + "epoch": 1.901949860724234, + "grad_norm": 1.6195857524871826, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8698674440383911, + "num_tokens": 373439106.0, + "step": 10242 + }, + { + "epoch": 1.9021355617455895, + "grad_norm": 1.570271611213684, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8798614740371704, + "num_tokens": 373474126.0, + "step": 10243 + }, + { + "epoch": 1.9023212627669452, + "grad_norm": 1.527112364768982, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8835548758506775, + "num_tokens": 373509671.0, + "step": 10244 + }, + { + "epoch": 1.902506963788301, + "grad_norm": 1.4041309356689453, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8825258016586304, + "num_tokens": 373550001.0, + "step": 10245 + }, + { + "epoch": 1.9026926648096565, + "grad_norm": 1.450871467590332, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8899780511856079, + "num_tokens": 373588362.0, + "step": 10246 + }, + { + "epoch": 1.902878365831012, + "grad_norm": 1.668296456336975, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8810819983482361, + "num_tokens": 373620151.0, + "step": 10247 + }, + { + "epoch": 1.9030640668523677, + "grad_norm": 1.4756489992141724, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.873181939125061, + "num_tokens": 373663337.0, + "step": 10248 + }, + { + "epoch": 1.9032497678737235, + "grad_norm": 1.773106336593628, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8771580457687378, + "num_tokens": 373692042.0, + "step": 10249 + }, + { + "epoch": 1.903435468895079, + "grad_norm": 1.475773572921753, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8626585006713867, + "num_tokens": 373736209.0, + "step": 10250 + }, + { + "epoch": 1.9036211699164345, + "grad_norm": 1.4356099367141724, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8828599452972412, + "num_tokens": 373776444.0, + "step": 10251 + }, + { + "epoch": 1.9038068709377902, + "grad_norm": 1.6139538288116455, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8779241442680359, + "num_tokens": 373813844.0, + "step": 10252 + }, + { + "epoch": 1.9039925719591457, + "grad_norm": 1.4075441360473633, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8786504864692688, + "num_tokens": 373856177.0, + "step": 10253 + }, + { + "epoch": 1.9041782729805012, + "grad_norm": 1.5203533172607422, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8844437003135681, + "num_tokens": 373894659.0, + "step": 10254 + }, + { + "epoch": 1.904363974001857, + "grad_norm": 1.6162761449813843, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8819730281829834, + "num_tokens": 373928996.0, + "step": 10255 + }, + { + "epoch": 1.9045496750232127, + "grad_norm": 1.5480148792266846, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8658740520477295, + "num_tokens": 373969845.0, + "step": 10256 + }, + { + "epoch": 1.9047353760445682, + "grad_norm": 1.5468560457229614, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8821860551834106, + "num_tokens": 374003349.0, + "step": 10257 + }, + { + "epoch": 1.9049210770659237, + "grad_norm": 1.4937206506729126, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8756217956542969, + "num_tokens": 374041872.0, + "step": 10258 + }, + { + "epoch": 1.9051067780872795, + "grad_norm": 2.8427910804748535, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8592064380645752, + "num_tokens": 374077726.0, + "step": 10259 + }, + { + "epoch": 1.9052924791086352, + "grad_norm": 1.4884998798370361, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8884438276290894, + "num_tokens": 374114234.0, + "step": 10260 + }, + { + "epoch": 1.9054781801299907, + "grad_norm": 1.4939155578613281, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8769270181655884, + "num_tokens": 374150898.0, + "step": 10261 + }, + { + "epoch": 1.9056638811513462, + "grad_norm": 1.4242647886276245, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8899093270301819, + "num_tokens": 374189299.0, + "step": 10262 + }, + { + "epoch": 1.905849582172702, + "grad_norm": 1.713201642036438, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8633073568344116, + "num_tokens": 374220525.0, + "step": 10263 + }, + { + "epoch": 1.9060352831940577, + "grad_norm": 1.5009994506835938, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8758329153060913, + "num_tokens": 374260099.0, + "step": 10264 + }, + { + "epoch": 1.9062209842154132, + "grad_norm": 1.5087591409683228, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8872045278549194, + "num_tokens": 374293699.0, + "step": 10265 + }, + { + "epoch": 1.9064066852367687, + "grad_norm": 1.5956825017929077, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8721160888671875, + "num_tokens": 374328049.0, + "step": 10266 + }, + { + "epoch": 1.9065923862581244, + "grad_norm": 1.4827476739883423, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8961489796638489, + "num_tokens": 374363211.0, + "step": 10267 + }, + { + "epoch": 1.9067780872794802, + "grad_norm": 1.56794273853302, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8806736469268799, + "num_tokens": 374399658.0, + "step": 10268 + }, + { + "epoch": 1.9069637883008357, + "grad_norm": 1.4854744672775269, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8757213354110718, + "num_tokens": 374437910.0, + "step": 10269 + }, + { + "epoch": 1.9071494893221912, + "grad_norm": 1.4922071695327759, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8769529461860657, + "num_tokens": 374475739.0, + "step": 10270 + }, + { + "epoch": 1.907335190343547, + "grad_norm": 1.480350136756897, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.873971700668335, + "num_tokens": 374517716.0, + "step": 10271 + }, + { + "epoch": 1.9075208913649027, + "grad_norm": 1.5192631483078003, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8740768432617188, + "num_tokens": 374553962.0, + "step": 10272 + }, + { + "epoch": 1.9077065923862582, + "grad_norm": 1.6099046468734741, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8873358964920044, + "num_tokens": 374584049.0, + "step": 10273 + }, + { + "epoch": 1.9078922934076137, + "grad_norm": 1.6402714252471924, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8815371990203857, + "num_tokens": 374619936.0, + "step": 10274 + }, + { + "epoch": 1.9080779944289694, + "grad_norm": 1.6259450912475586, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8659661412239075, + "num_tokens": 374654075.0, + "step": 10275 + }, + { + "epoch": 1.908263695450325, + "grad_norm": 1.4388786554336548, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8832042813301086, + "num_tokens": 374694899.0, + "step": 10276 + }, + { + "epoch": 1.9084493964716804, + "grad_norm": 1.5431909561157227, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8971472978591919, + "num_tokens": 374727291.0, + "step": 10277 + }, + { + "epoch": 1.9086350974930362, + "grad_norm": 1.6426646709442139, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8737581372261047, + "num_tokens": 374762852.0, + "step": 10278 + }, + { + "epoch": 1.908820798514392, + "grad_norm": 1.486411690711975, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8739009499549866, + "num_tokens": 374801469.0, + "step": 10279 + }, + { + "epoch": 1.9090064995357474, + "grad_norm": 1.4535269737243652, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8849848508834839, + "num_tokens": 374839577.0, + "step": 10280 + }, + { + "epoch": 1.909192200557103, + "grad_norm": 1.616937279701233, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8758906722068787, + "num_tokens": 374872361.0, + "step": 10281 + }, + { + "epoch": 1.9093779015784587, + "grad_norm": 1.594022274017334, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8752058148384094, + "num_tokens": 374905634.0, + "step": 10282 + }, + { + "epoch": 1.9095636025998144, + "grad_norm": 1.7115428447723389, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8871169090270996, + "num_tokens": 374937405.0, + "step": 10283 + }, + { + "epoch": 1.90974930362117, + "grad_norm": 1.5134690999984741, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8912594318389893, + "num_tokens": 374974370.0, + "step": 10284 + }, + { + "epoch": 1.9099350046425254, + "grad_norm": 1.667099118232727, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8551190495491028, + "num_tokens": 375008116.0, + "step": 10285 + }, + { + "epoch": 1.9101207056638811, + "grad_norm": 1.4717766046524048, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8843315839767456, + "num_tokens": 375047032.0, + "step": 10286 + }, + { + "epoch": 1.9103064066852369, + "grad_norm": 1.5050462484359741, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8935815095901489, + "num_tokens": 375082515.0, + "step": 10287 + }, + { + "epoch": 1.9104921077065924, + "grad_norm": 1.5419350862503052, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8734680414199829, + "num_tokens": 375122084.0, + "step": 10288 + }, + { + "epoch": 1.910677808727948, + "grad_norm": 1.5001640319824219, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8941382169723511, + "num_tokens": 375157134.0, + "step": 10289 + }, + { + "epoch": 1.9108635097493036, + "grad_norm": 1.4269553422927856, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8727895617485046, + "num_tokens": 375198480.0, + "step": 10290 + }, + { + "epoch": 1.9110492107706594, + "grad_norm": 1.4337785243988037, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8792011141777039, + "num_tokens": 375239654.0, + "step": 10291 + }, + { + "epoch": 1.9112349117920149, + "grad_norm": 1.5937931537628174, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.863871693611145, + "num_tokens": 375277066.0, + "step": 10292 + }, + { + "epoch": 1.9114206128133704, + "grad_norm": 1.5235782861709595, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8717706203460693, + "num_tokens": 375316181.0, + "step": 10293 + }, + { + "epoch": 1.9116063138347261, + "grad_norm": 1.620835304260254, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8828433752059937, + "num_tokens": 375347326.0, + "step": 10294 + }, + { + "epoch": 1.9117920148560819, + "grad_norm": 1.4697513580322266, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8828139305114746, + "num_tokens": 375385048.0, + "step": 10295 + }, + { + "epoch": 1.9119777158774374, + "grad_norm": 1.5814580917358398, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8826114535331726, + "num_tokens": 375417521.0, + "step": 10296 + }, + { + "epoch": 1.9121634168987929, + "grad_norm": 1.3835145235061646, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.8979424238204956, + "num_tokens": 375458697.0, + "step": 10297 + }, + { + "epoch": 1.9123491179201486, + "grad_norm": 1.542822003364563, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8678897619247437, + "num_tokens": 375494818.0, + "step": 10298 + }, + { + "epoch": 1.9125348189415043, + "grad_norm": 1.491740107536316, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8878252506256104, + "num_tokens": 375531107.0, + "step": 10299 + }, + { + "epoch": 1.9127205199628596, + "grad_norm": 1.4715368747711182, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8663249611854553, + "num_tokens": 375574326.0, + "step": 10300 + }, + { + "epoch": 1.9129062209842154, + "grad_norm": 1.5046920776367188, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8798545002937317, + "num_tokens": 375610000.0, + "step": 10301 + }, + { + "epoch": 1.913091922005571, + "grad_norm": 1.6072067022323608, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8963152766227722, + "num_tokens": 375643993.0, + "step": 10302 + }, + { + "epoch": 1.9132776230269266, + "grad_norm": 1.4975093603134155, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8801097273826599, + "num_tokens": 375679723.0, + "step": 10303 + }, + { + "epoch": 1.9134633240482821, + "grad_norm": 1.6615142822265625, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8680827617645264, + "num_tokens": 375713665.0, + "step": 10304 + }, + { + "epoch": 1.9136490250696379, + "grad_norm": 1.526194453239441, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8891167640686035, + "num_tokens": 375746873.0, + "step": 10305 + }, + { + "epoch": 1.9138347260909936, + "grad_norm": 1.483389139175415, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8746572732925415, + "num_tokens": 375782691.0, + "step": 10306 + }, + { + "epoch": 1.914020427112349, + "grad_norm": 1.5716965198516846, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8858020901679993, + "num_tokens": 375816302.0, + "step": 10307 + }, + { + "epoch": 1.9142061281337046, + "grad_norm": 1.5322213172912598, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8819460868835449, + "num_tokens": 375853974.0, + "step": 10308 + }, + { + "epoch": 1.9143918291550603, + "grad_norm": 1.488716721534729, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8882151246070862, + "num_tokens": 375889110.0, + "step": 10309 + }, + { + "epoch": 1.914577530176416, + "grad_norm": 1.543052315711975, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8724788427352905, + "num_tokens": 375923032.0, + "step": 10310 + }, + { + "epoch": 1.9147632311977716, + "grad_norm": 1.496994137763977, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8893527984619141, + "num_tokens": 375958588.0, + "step": 10311 + }, + { + "epoch": 1.914948932219127, + "grad_norm": 1.5427242517471313, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8752120137214661, + "num_tokens": 375996784.0, + "step": 10312 + }, + { + "epoch": 1.9151346332404828, + "grad_norm": 1.4164146184921265, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8704618215560913, + "num_tokens": 376038736.0, + "step": 10313 + }, + { + "epoch": 1.9153203342618386, + "grad_norm": 1.4805487394332886, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.877732515335083, + "num_tokens": 376077421.0, + "step": 10314 + }, + { + "epoch": 1.915506035283194, + "grad_norm": 1.495666742324829, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8934344053268433, + "num_tokens": 376112711.0, + "step": 10315 + }, + { + "epoch": 1.9156917363045496, + "grad_norm": 1.5705718994140625, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8793426752090454, + "num_tokens": 376144699.0, + "step": 10316 + }, + { + "epoch": 1.9158774373259053, + "grad_norm": 1.4966599941253662, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.874633252620697, + "num_tokens": 376182929.0, + "step": 10317 + }, + { + "epoch": 1.916063138347261, + "grad_norm": 1.6460989713668823, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8789179921150208, + "num_tokens": 376213769.0, + "step": 10318 + }, + { + "epoch": 1.9162488393686166, + "grad_norm": 1.4620054960250854, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8897605538368225, + "num_tokens": 376247787.0, + "step": 10319 + }, + { + "epoch": 1.916434540389972, + "grad_norm": 1.53085458278656, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8957591652870178, + "num_tokens": 376281452.0, + "step": 10320 + }, + { + "epoch": 1.9166202414113278, + "grad_norm": 1.5544605255126953, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8880270719528198, + "num_tokens": 376313249.0, + "step": 10321 + }, + { + "epoch": 1.9168059424326835, + "grad_norm": 1.441707730293274, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8840827345848083, + "num_tokens": 376352857.0, + "step": 10322 + }, + { + "epoch": 1.916991643454039, + "grad_norm": 1.4261468648910522, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8941987752914429, + "num_tokens": 376389476.0, + "step": 10323 + }, + { + "epoch": 1.9171773444753946, + "grad_norm": 1.5873658657073975, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.889344334602356, + "num_tokens": 376425397.0, + "step": 10324 + }, + { + "epoch": 1.9173630454967503, + "grad_norm": 1.5178133249282837, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8845842480659485, + "num_tokens": 376462294.0, + "step": 10325 + }, + { + "epoch": 1.9175487465181058, + "grad_norm": 1.5162376165390015, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8810919523239136, + "num_tokens": 376500348.0, + "step": 10326 + }, + { + "epoch": 1.9177344475394613, + "grad_norm": 1.4984338283538818, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8955767154693604, + "num_tokens": 376535318.0, + "step": 10327 + }, + { + "epoch": 1.917920148560817, + "grad_norm": 1.5634794235229492, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8850004076957703, + "num_tokens": 376568438.0, + "step": 10328 + }, + { + "epoch": 1.9181058495821728, + "grad_norm": 1.4536346197128296, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8949000239372253, + "num_tokens": 376606942.0, + "step": 10329 + }, + { + "epoch": 1.9182915506035283, + "grad_norm": 1.666259765625, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8760315179824829, + "num_tokens": 376643414.0, + "step": 10330 + }, + { + "epoch": 1.9184772516248838, + "grad_norm": 1.370998740196228, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8819863796234131, + "num_tokens": 376684890.0, + "step": 10331 + }, + { + "epoch": 1.9186629526462395, + "grad_norm": 1.5039721727371216, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8787354230880737, + "num_tokens": 376720248.0, + "step": 10332 + }, + { + "epoch": 1.9188486536675953, + "grad_norm": 1.497624158859253, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8808044791221619, + "num_tokens": 376758828.0, + "step": 10333 + }, + { + "epoch": 1.9190343546889508, + "grad_norm": 1.4951344728469849, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8732285499572754, + "num_tokens": 376796944.0, + "step": 10334 + }, + { + "epoch": 1.9192200557103063, + "grad_norm": 1.536805510520935, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8907696604728699, + "num_tokens": 376827604.0, + "step": 10335 + }, + { + "epoch": 1.919405756731662, + "grad_norm": 1.607419490814209, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.873650848865509, + "num_tokens": 376863307.0, + "step": 10336 + }, + { + "epoch": 1.9195914577530178, + "grad_norm": 1.4693589210510254, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8845566511154175, + "num_tokens": 376899344.0, + "step": 10337 + }, + { + "epoch": 1.9197771587743733, + "grad_norm": 1.518791675567627, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8878825902938843, + "num_tokens": 376933606.0, + "step": 10338 + }, + { + "epoch": 1.9199628597957288, + "grad_norm": 1.571743130683899, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8767166137695312, + "num_tokens": 376968093.0, + "step": 10339 + }, + { + "epoch": 1.9201485608170845, + "grad_norm": 1.575143814086914, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8938841223716736, + "num_tokens": 376997474.0, + "step": 10340 + }, + { + "epoch": 1.9203342618384402, + "grad_norm": 1.5655673742294312, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8837270736694336, + "num_tokens": 377028833.0, + "step": 10341 + }, + { + "epoch": 1.9205199628597958, + "grad_norm": 1.4905154705047607, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8835349082946777, + "num_tokens": 377061801.0, + "step": 10342 + }, + { + "epoch": 1.9207056638811513, + "grad_norm": 1.5321506261825562, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8768237829208374, + "num_tokens": 377098687.0, + "step": 10343 + }, + { + "epoch": 1.920891364902507, + "grad_norm": 1.6063671112060547, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8639552593231201, + "num_tokens": 377133216.0, + "step": 10344 + }, + { + "epoch": 1.9210770659238627, + "grad_norm": 1.5598053932189941, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8762549161911011, + "num_tokens": 377170394.0, + "step": 10345 + }, + { + "epoch": 1.9212627669452182, + "grad_norm": 1.799959421157837, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8712559342384338, + "num_tokens": 377200246.0, + "step": 10346 + }, + { + "epoch": 1.9214484679665738, + "grad_norm": 1.410531759262085, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8923051953315735, + "num_tokens": 377237365.0, + "step": 10347 + }, + { + "epoch": 1.9216341689879295, + "grad_norm": 1.490921974182129, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8625948429107666, + "num_tokens": 377276489.0, + "step": 10348 + }, + { + "epoch": 1.921819870009285, + "grad_norm": 1.614101529121399, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8678685426712036, + "num_tokens": 377310971.0, + "step": 10349 + }, + { + "epoch": 1.9220055710306405, + "grad_norm": 1.7632553577423096, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8616023063659668, + "num_tokens": 377340981.0, + "step": 10350 + }, + { + "epoch": 1.9221912720519962, + "grad_norm": 1.5095973014831543, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8905012607574463, + "num_tokens": 377380432.0, + "step": 10351 + }, + { + "epoch": 1.922376973073352, + "grad_norm": 1.5999783277511597, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8760280013084412, + "num_tokens": 377416943.0, + "step": 10352 + }, + { + "epoch": 1.9225626740947075, + "grad_norm": 1.5190761089324951, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8725755214691162, + "num_tokens": 377453315.0, + "step": 10353 + }, + { + "epoch": 1.922748375116063, + "grad_norm": 1.5023130178451538, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8691493272781372, + "num_tokens": 377493058.0, + "step": 10354 + }, + { + "epoch": 1.9229340761374187, + "grad_norm": 1.6588616371154785, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8757990598678589, + "num_tokens": 377525028.0, + "step": 10355 + }, + { + "epoch": 1.9231197771587745, + "grad_norm": 1.533611536026001, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8772466778755188, + "num_tokens": 377562966.0, + "step": 10356 + }, + { + "epoch": 1.92330547818013, + "grad_norm": 1.5231088399887085, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8917005658149719, + "num_tokens": 377602290.0, + "step": 10357 + }, + { + "epoch": 1.9234911792014855, + "grad_norm": 1.5862573385238647, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8753196001052856, + "num_tokens": 377636363.0, + "step": 10358 + }, + { + "epoch": 1.9236768802228412, + "grad_norm": 1.600512146949768, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8755706548690796, + "num_tokens": 377670204.0, + "step": 10359 + }, + { + "epoch": 1.923862581244197, + "grad_norm": 1.551135778427124, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8814395666122437, + "num_tokens": 377704742.0, + "step": 10360 + }, + { + "epoch": 1.9240482822655525, + "grad_norm": 1.4699827432632446, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8714510202407837, + "num_tokens": 377742341.0, + "step": 10361 + }, + { + "epoch": 1.924233983286908, + "grad_norm": 1.4933511018753052, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8643456697463989, + "num_tokens": 377788948.0, + "step": 10362 + }, + { + "epoch": 1.9244196843082637, + "grad_norm": 1.5363658666610718, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8725042939186096, + "num_tokens": 377826735.0, + "step": 10363 + }, + { + "epoch": 1.9246053853296194, + "grad_norm": 1.6449310779571533, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8781302571296692, + "num_tokens": 377862071.0, + "step": 10364 + }, + { + "epoch": 1.924791086350975, + "grad_norm": 1.499988317489624, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8700437545776367, + "num_tokens": 377905237.0, + "step": 10365 + }, + { + "epoch": 1.9249767873723305, + "grad_norm": 1.633529543876648, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.857239305973053, + "num_tokens": 377939763.0, + "step": 10366 + }, + { + "epoch": 1.9251624883936862, + "grad_norm": 1.4731072187423706, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8760218620300293, + "num_tokens": 377975606.0, + "step": 10367 + }, + { + "epoch": 1.925348189415042, + "grad_norm": 1.5355576276779175, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8886661529541016, + "num_tokens": 378009064.0, + "step": 10368 + }, + { + "epoch": 1.9255338904363974, + "grad_norm": 1.4658126831054688, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8757343888282776, + "num_tokens": 378047232.0, + "step": 10369 + }, + { + "epoch": 1.925719591457753, + "grad_norm": 1.6098833084106445, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.87880939245224, + "num_tokens": 378083823.0, + "step": 10370 + }, + { + "epoch": 1.9259052924791087, + "grad_norm": 1.6177030801773071, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8869975209236145, + "num_tokens": 378124439.0, + "step": 10371 + }, + { + "epoch": 1.9260909935004642, + "grad_norm": 1.5786986351013184, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8826629519462585, + "num_tokens": 378160751.0, + "step": 10372 + }, + { + "epoch": 1.9262766945218197, + "grad_norm": 1.6216087341308594, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8887914419174194, + "num_tokens": 378191310.0, + "step": 10373 + }, + { + "epoch": 1.9264623955431754, + "grad_norm": 1.4768903255462646, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8785420656204224, + "num_tokens": 378228402.0, + "step": 10374 + }, + { + "epoch": 1.9266480965645312, + "grad_norm": 1.6656668186187744, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.8994811773300171, + "num_tokens": 378255043.0, + "step": 10375 + }, + { + "epoch": 1.9268337975858867, + "grad_norm": 1.5711909532546997, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8820173740386963, + "num_tokens": 378289346.0, + "step": 10376 + }, + { + "epoch": 1.9270194986072422, + "grad_norm": 1.495505928993225, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.887239933013916, + "num_tokens": 378327845.0, + "step": 10377 + }, + { + "epoch": 1.927205199628598, + "grad_norm": 1.5239689350128174, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8859943151473999, + "num_tokens": 378364544.0, + "step": 10378 + }, + { + "epoch": 1.9273909006499537, + "grad_norm": 1.4875394105911255, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8910340070724487, + "num_tokens": 378400026.0, + "step": 10379 + }, + { + "epoch": 1.9275766016713092, + "grad_norm": 1.5423787832260132, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8788039684295654, + "num_tokens": 378438868.0, + "step": 10380 + }, + { + "epoch": 1.9277623026926647, + "grad_norm": 1.831913948059082, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8815176486968994, + "num_tokens": 378464169.0, + "step": 10381 + }, + { + "epoch": 1.9279480037140204, + "grad_norm": 1.591960072517395, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8845909833908081, + "num_tokens": 378497546.0, + "step": 10382 + }, + { + "epoch": 1.9281337047353762, + "grad_norm": 1.3995217084884644, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.895067036151886, + "num_tokens": 378535100.0, + "step": 10383 + }, + { + "epoch": 1.9283194057567317, + "grad_norm": 1.7794586420059204, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8607698082923889, + "num_tokens": 378568494.0, + "step": 10384 + }, + { + "epoch": 1.9285051067780872, + "grad_norm": 1.5160877704620361, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8713237643241882, + "num_tokens": 378603379.0, + "step": 10385 + }, + { + "epoch": 1.928690807799443, + "grad_norm": 1.5534603595733643, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8719255924224854, + "num_tokens": 378638393.0, + "step": 10386 + }, + { + "epoch": 1.9288765088207986, + "grad_norm": 1.4426366090774536, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8877973556518555, + "num_tokens": 378674638.0, + "step": 10387 + }, + { + "epoch": 1.9290622098421542, + "grad_norm": 1.7020926475524902, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8683972358703613, + "num_tokens": 378707659.0, + "step": 10388 + }, + { + "epoch": 1.9292479108635097, + "grad_norm": 1.5165425539016724, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.880504846572876, + "num_tokens": 378745802.0, + "step": 10389 + }, + { + "epoch": 1.9294336118848654, + "grad_norm": 1.4659126996994019, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8645231127738953, + "num_tokens": 378784383.0, + "step": 10390 + }, + { + "epoch": 1.9296193129062211, + "grad_norm": 1.47607421875, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8918553590774536, + "num_tokens": 378820678.0, + "step": 10391 + }, + { + "epoch": 1.9298050139275766, + "grad_norm": 1.3674960136413574, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8802881240844727, + "num_tokens": 378864476.0, + "step": 10392 + }, + { + "epoch": 1.9299907149489322, + "grad_norm": 1.4702129364013672, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8813862800598145, + "num_tokens": 378901640.0, + "step": 10393 + }, + { + "epoch": 1.9301764159702879, + "grad_norm": 1.520365834236145, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8654519319534302, + "num_tokens": 378938611.0, + "step": 10394 + }, + { + "epoch": 1.9303621169916436, + "grad_norm": 1.512954831123352, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8723196983337402, + "num_tokens": 378977534.0, + "step": 10395 + }, + { + "epoch": 1.930547818012999, + "grad_norm": 1.6949896812438965, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8642270565032959, + "num_tokens": 379010025.0, + "step": 10396 + }, + { + "epoch": 1.9307335190343546, + "grad_norm": 1.541538953781128, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8850805163383484, + "num_tokens": 379044899.0, + "step": 10397 + }, + { + "epoch": 1.9309192200557104, + "grad_norm": 1.603859305381775, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8879330158233643, + "num_tokens": 379075927.0, + "step": 10398 + }, + { + "epoch": 1.9311049210770659, + "grad_norm": 1.385979175567627, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8872907757759094, + "num_tokens": 379114691.0, + "step": 10399 + }, + { + "epoch": 1.9312906220984214, + "grad_norm": 1.658049464225769, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8739616274833679, + "num_tokens": 379147875.0, + "step": 10400 + }, + { + "epoch": 1.9314763231197771, + "grad_norm": 1.5565294027328491, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8712411522865295, + "num_tokens": 379184767.0, + "step": 10401 + }, + { + "epoch": 1.9316620241411329, + "grad_norm": 1.5342433452606201, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8778242468833923, + "num_tokens": 379220537.0, + "step": 10402 + }, + { + "epoch": 1.9318477251624884, + "grad_norm": 1.6031508445739746, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.885982096195221, + "num_tokens": 379256660.0, + "step": 10403 + }, + { + "epoch": 1.9320334261838439, + "grad_norm": 1.4798716306686401, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8651288151741028, + "num_tokens": 379301027.0, + "step": 10404 + }, + { + "epoch": 1.9322191272051996, + "grad_norm": 1.6187255382537842, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8707997798919678, + "num_tokens": 379334838.0, + "step": 10405 + }, + { + "epoch": 1.9324048282265553, + "grad_norm": 1.6265435218811035, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8511961102485657, + "num_tokens": 379371045.0, + "step": 10406 + }, + { + "epoch": 1.9325905292479109, + "grad_norm": 1.6744788885116577, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8748123645782471, + "num_tokens": 379406617.0, + "step": 10407 + }, + { + "epoch": 1.9327762302692664, + "grad_norm": 1.7235203981399536, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8641014099121094, + "num_tokens": 379438275.0, + "step": 10408 + }, + { + "epoch": 1.932961931290622, + "grad_norm": 1.5992108583450317, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.880791425704956, + "num_tokens": 379473798.0, + "step": 10409 + }, + { + "epoch": 1.9331476323119778, + "grad_norm": 1.4232842922210693, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.880222499370575, + "num_tokens": 379514644.0, + "step": 10410 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 1.4731603860855103, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8767153024673462, + "num_tokens": 379552943.0, + "step": 10411 + }, + { + "epoch": 1.9335190343546889, + "grad_norm": 1.677851915359497, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8802598714828491, + "num_tokens": 379587929.0, + "step": 10412 + }, + { + "epoch": 1.9337047353760446, + "grad_norm": 1.4270695447921753, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8801777362823486, + "num_tokens": 379625540.0, + "step": 10413 + }, + { + "epoch": 1.9338904363974003, + "grad_norm": 1.6320620775222778, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8794512748718262, + "num_tokens": 379660626.0, + "step": 10414 + }, + { + "epoch": 1.9340761374187558, + "grad_norm": 1.5297555923461914, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8823671936988831, + "num_tokens": 379695735.0, + "step": 10415 + }, + { + "epoch": 1.9342618384401113, + "grad_norm": 1.5706673860549927, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8803518414497375, + "num_tokens": 379730980.0, + "step": 10416 + }, + { + "epoch": 1.934447539461467, + "grad_norm": 1.608830451965332, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8770856857299805, + "num_tokens": 379764048.0, + "step": 10417 + }, + { + "epoch": 1.9346332404828228, + "grad_norm": 1.4780977964401245, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8679268956184387, + "num_tokens": 379801929.0, + "step": 10418 + }, + { + "epoch": 1.9348189415041783, + "grad_norm": 1.5918992757797241, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8756262063980103, + "num_tokens": 379835442.0, + "step": 10419 + }, + { + "epoch": 1.9350046425255338, + "grad_norm": 1.481471061706543, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8869181275367737, + "num_tokens": 379870942.0, + "step": 10420 + }, + { + "epoch": 1.9351903435468896, + "grad_norm": 1.4440820217132568, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8873701095581055, + "num_tokens": 379907391.0, + "step": 10421 + }, + { + "epoch": 1.935376044568245, + "grad_norm": 1.5377520322799683, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8801604509353638, + "num_tokens": 379944440.0, + "step": 10422 + }, + { + "epoch": 1.9355617455896006, + "grad_norm": 1.6409932374954224, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.864924430847168, + "num_tokens": 379980362.0, + "step": 10423 + }, + { + "epoch": 1.9357474466109563, + "grad_norm": 1.515080213546753, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8824365139007568, + "num_tokens": 380015405.0, + "step": 10424 + }, + { + "epoch": 1.935933147632312, + "grad_norm": 1.6405810117721558, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8779197931289673, + "num_tokens": 380051652.0, + "step": 10425 + }, + { + "epoch": 1.9361188486536676, + "grad_norm": 1.5793483257293701, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8855422735214233, + "num_tokens": 380086341.0, + "step": 10426 + }, + { + "epoch": 1.936304549675023, + "grad_norm": 1.515883207321167, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.879267692565918, + "num_tokens": 380124461.0, + "step": 10427 + }, + { + "epoch": 1.9364902506963788, + "grad_norm": 1.571853756904602, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8575794696807861, + "num_tokens": 380161905.0, + "step": 10428 + }, + { + "epoch": 1.9366759517177345, + "grad_norm": 1.3769912719726562, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8880627751350403, + "num_tokens": 380205192.0, + "step": 10429 + }, + { + "epoch": 1.93686165273909, + "grad_norm": 1.6473692655563354, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8716863393783569, + "num_tokens": 380240211.0, + "step": 10430 + }, + { + "epoch": 1.9370473537604456, + "grad_norm": 1.4848322868347168, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8739296197891235, + "num_tokens": 380279457.0, + "step": 10431 + }, + { + "epoch": 1.9372330547818013, + "grad_norm": 1.4781659841537476, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8779860138893127, + "num_tokens": 380314774.0, + "step": 10432 + }, + { + "epoch": 1.937418755803157, + "grad_norm": 1.5510743856430054, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8720785975456238, + "num_tokens": 380351783.0, + "step": 10433 + }, + { + "epoch": 1.9376044568245125, + "grad_norm": 1.5051732063293457, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8853458762168884, + "num_tokens": 380387796.0, + "step": 10434 + }, + { + "epoch": 1.937790157845868, + "grad_norm": 1.5073548555374146, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8867336511611938, + "num_tokens": 380422128.0, + "step": 10435 + }, + { + "epoch": 1.9379758588672238, + "grad_norm": 1.4251508712768555, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8765164017677307, + "num_tokens": 380460381.0, + "step": 10436 + }, + { + "epoch": 1.9381615598885795, + "grad_norm": 1.491707682609558, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8794300556182861, + "num_tokens": 380498725.0, + "step": 10437 + }, + { + "epoch": 1.938347260909935, + "grad_norm": 1.6028556823730469, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8848514556884766, + "num_tokens": 380532342.0, + "step": 10438 + }, + { + "epoch": 1.9385329619312905, + "grad_norm": 1.4452470541000366, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8834749460220337, + "num_tokens": 380573356.0, + "step": 10439 + }, + { + "epoch": 1.9387186629526463, + "grad_norm": 1.468973159790039, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8849382400512695, + "num_tokens": 380610304.0, + "step": 10440 + }, + { + "epoch": 1.938904363974002, + "grad_norm": 1.4469554424285889, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8932981491088867, + "num_tokens": 380643866.0, + "step": 10441 + }, + { + "epoch": 1.9390900649953575, + "grad_norm": 1.5282998085021973, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.887575626373291, + "num_tokens": 380682191.0, + "step": 10442 + }, + { + "epoch": 1.939275766016713, + "grad_norm": 1.4978734254837036, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8822380900382996, + "num_tokens": 380718676.0, + "step": 10443 + }, + { + "epoch": 1.9394614670380688, + "grad_norm": 1.593923807144165, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8829423189163208, + "num_tokens": 380754633.0, + "step": 10444 + }, + { + "epoch": 1.9396471680594243, + "grad_norm": 1.5632481575012207, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8829538822174072, + "num_tokens": 380789659.0, + "step": 10445 + }, + { + "epoch": 1.9398328690807798, + "grad_norm": 1.4214353561401367, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.887195348739624, + "num_tokens": 380828132.0, + "step": 10446 + }, + { + "epoch": 1.9400185701021355, + "grad_norm": 1.533820629119873, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8836723566055298, + "num_tokens": 380862958.0, + "step": 10447 + }, + { + "epoch": 1.9402042711234913, + "grad_norm": 1.5853487253189087, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8812763690948486, + "num_tokens": 380895547.0, + "step": 10448 + }, + { + "epoch": 1.9403899721448468, + "grad_norm": 1.6349724531173706, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8784502744674683, + "num_tokens": 380926127.0, + "step": 10449 + }, + { + "epoch": 1.9405756731662023, + "grad_norm": 1.5127663612365723, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8821803331375122, + "num_tokens": 380962805.0, + "step": 10450 + }, + { + "epoch": 1.940761374187558, + "grad_norm": 1.5200597047805786, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8749532699584961, + "num_tokens": 381001471.0, + "step": 10451 + }, + { + "epoch": 1.9409470752089137, + "grad_norm": 1.4435274600982666, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8717383146286011, + "num_tokens": 381039990.0, + "step": 10452 + }, + { + "epoch": 1.9411327762302693, + "grad_norm": 1.6250860691070557, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8682668209075928, + "num_tokens": 381074466.0, + "step": 10453 + }, + { + "epoch": 1.9413184772516248, + "grad_norm": 1.6244338750839233, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8733186721801758, + "num_tokens": 381111446.0, + "step": 10454 + }, + { + "epoch": 1.9415041782729805, + "grad_norm": 1.4429311752319336, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8911546468734741, + "num_tokens": 381146969.0, + "step": 10455 + }, + { + "epoch": 1.9416898792943362, + "grad_norm": 1.5369032621383667, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8795697093009949, + "num_tokens": 381183665.0, + "step": 10456 + }, + { + "epoch": 1.9418755803156917, + "grad_norm": 1.5782685279846191, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.871422529220581, + "num_tokens": 381223649.0, + "step": 10457 + }, + { + "epoch": 1.9420612813370473, + "grad_norm": 1.451326608657837, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.878745436668396, + "num_tokens": 381259573.0, + "step": 10458 + }, + { + "epoch": 1.942246982358403, + "grad_norm": 1.607060194015503, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8689029216766357, + "num_tokens": 381293602.0, + "step": 10459 + }, + { + "epoch": 1.9424326833797587, + "grad_norm": 1.478345274925232, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8842630982398987, + "num_tokens": 381327884.0, + "step": 10460 + }, + { + "epoch": 1.9426183844011142, + "grad_norm": 1.4155797958374023, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8800152540206909, + "num_tokens": 381367156.0, + "step": 10461 + }, + { + "epoch": 1.9428040854224697, + "grad_norm": 1.485049843788147, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8950845003128052, + "num_tokens": 381405172.0, + "step": 10462 + }, + { + "epoch": 1.9429897864438255, + "grad_norm": 1.5149891376495361, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8825300931930542, + "num_tokens": 381439267.0, + "step": 10463 + }, + { + "epoch": 1.9431754874651812, + "grad_norm": 1.5280091762542725, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8892530798912048, + "num_tokens": 381469317.0, + "step": 10464 + }, + { + "epoch": 1.9433611884865367, + "grad_norm": 1.4855960607528687, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8868821859359741, + "num_tokens": 381504157.0, + "step": 10465 + }, + { + "epoch": 1.9435468895078922, + "grad_norm": 1.5039392709732056, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8854357004165649, + "num_tokens": 381538050.0, + "step": 10466 + }, + { + "epoch": 1.943732590529248, + "grad_norm": 1.5403079986572266, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8897503614425659, + "num_tokens": 381571446.0, + "step": 10467 + }, + { + "epoch": 1.9439182915506037, + "grad_norm": 1.4816949367523193, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8748859167098999, + "num_tokens": 381609336.0, + "step": 10468 + }, + { + "epoch": 1.944103992571959, + "grad_norm": 1.6017612218856812, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8863017559051514, + "num_tokens": 381639908.0, + "step": 10469 + }, + { + "epoch": 1.9442896935933147, + "grad_norm": 1.715968370437622, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8705064058303833, + "num_tokens": 381673578.0, + "step": 10470 + }, + { + "epoch": 1.9444753946146704, + "grad_norm": 1.4924296140670776, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8931161165237427, + "num_tokens": 381709229.0, + "step": 10471 + }, + { + "epoch": 1.944661095636026, + "grad_norm": 1.5029120445251465, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8889490365982056, + "num_tokens": 381746627.0, + "step": 10472 + }, + { + "epoch": 1.9448467966573815, + "grad_norm": 1.5707988739013672, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8747391700744629, + "num_tokens": 381785377.0, + "step": 10473 + }, + { + "epoch": 1.9450324976787372, + "grad_norm": 1.6012461185455322, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8789352178573608, + "num_tokens": 381819818.0, + "step": 10474 + }, + { + "epoch": 1.945218198700093, + "grad_norm": 1.6943798065185547, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8812823295593262, + "num_tokens": 381849291.0, + "step": 10475 + }, + { + "epoch": 1.9454038997214484, + "grad_norm": 1.6199984550476074, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8748500943183899, + "num_tokens": 381882684.0, + "step": 10476 + }, + { + "epoch": 1.945589600742804, + "grad_norm": 1.5816816091537476, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8901770114898682, + "num_tokens": 381916336.0, + "step": 10477 + }, + { + "epoch": 1.9457753017641597, + "grad_norm": 1.4815258979797363, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8927255868911743, + "num_tokens": 381950960.0, + "step": 10478 + }, + { + "epoch": 1.9459610027855154, + "grad_norm": 1.6608690023422241, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8741705417633057, + "num_tokens": 381986974.0, + "step": 10479 + }, + { + "epoch": 1.946146703806871, + "grad_norm": 1.6871798038482666, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8777116537094116, + "num_tokens": 382020568.0, + "step": 10480 + }, + { + "epoch": 1.9463324048282264, + "grad_norm": 1.5081413984298706, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8929500579833984, + "num_tokens": 382054638.0, + "step": 10481 + }, + { + "epoch": 1.9465181058495822, + "grad_norm": 1.4493123292922974, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8757309317588806, + "num_tokens": 382092557.0, + "step": 10482 + }, + { + "epoch": 1.946703806870938, + "grad_norm": 1.4812655448913574, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8855311274528503, + "num_tokens": 382129081.0, + "step": 10483 + }, + { + "epoch": 1.9468895078922934, + "grad_norm": 1.4735594987869263, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8809162974357605, + "num_tokens": 382164457.0, + "step": 10484 + }, + { + "epoch": 1.947075208913649, + "grad_norm": 1.5292547941207886, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8786924481391907, + "num_tokens": 382202488.0, + "step": 10485 + }, + { + "epoch": 1.9472609099350047, + "grad_norm": 1.475258469581604, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8842487335205078, + "num_tokens": 382239346.0, + "step": 10486 + }, + { + "epoch": 1.9474466109563604, + "grad_norm": 1.5482507944107056, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8888186812400818, + "num_tokens": 382271536.0, + "step": 10487 + }, + { + "epoch": 1.947632311977716, + "grad_norm": 1.5436094999313354, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8865648508071899, + "num_tokens": 382304903.0, + "step": 10488 + }, + { + "epoch": 1.9478180129990714, + "grad_norm": 1.6733318567276, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8719900250434875, + "num_tokens": 382334933.0, + "step": 10489 + }, + { + "epoch": 1.9480037140204272, + "grad_norm": 1.6176989078521729, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.875937283039093, + "num_tokens": 382369132.0, + "step": 10490 + }, + { + "epoch": 1.948189415041783, + "grad_norm": 1.4574198722839355, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8639218807220459, + "num_tokens": 382414128.0, + "step": 10491 + }, + { + "epoch": 1.9483751160631384, + "grad_norm": 1.5317734479904175, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8796629905700684, + "num_tokens": 382451022.0, + "step": 10492 + }, + { + "epoch": 1.948560817084494, + "grad_norm": 1.5828763246536255, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8772070407867432, + "num_tokens": 382486141.0, + "step": 10493 + }, + { + "epoch": 1.9487465181058496, + "grad_norm": 1.6057711839675903, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8868244886398315, + "num_tokens": 382519039.0, + "step": 10494 + }, + { + "epoch": 1.9489322191272052, + "grad_norm": 1.3925093412399292, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8725399374961853, + "num_tokens": 382565770.0, + "step": 10495 + }, + { + "epoch": 1.9491179201485607, + "grad_norm": 1.5189049243927002, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8619809746742249, + "num_tokens": 382604907.0, + "step": 10496 + }, + { + "epoch": 1.9493036211699164, + "grad_norm": 1.5962098836898804, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8788984417915344, + "num_tokens": 382638893.0, + "step": 10497 + }, + { + "epoch": 1.9494893221912721, + "grad_norm": 1.6121351718902588, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8717079758644104, + "num_tokens": 382674022.0, + "step": 10498 + }, + { + "epoch": 1.9496750232126276, + "grad_norm": 1.491997241973877, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8772848844528198, + "num_tokens": 382711860.0, + "step": 10499 + }, + { + "epoch": 1.9498607242339832, + "grad_norm": 1.4688785076141357, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8901430368423462, + "num_tokens": 382750304.0, + "step": 10500 + }, + { + "epoch": 1.9500464252553389, + "grad_norm": 1.456006646156311, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8674595355987549, + "num_tokens": 382791371.0, + "step": 10501 + }, + { + "epoch": 1.9502321262766946, + "grad_norm": 1.44889497756958, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8755130767822266, + "num_tokens": 382832525.0, + "step": 10502 + }, + { + "epoch": 1.9504178272980501, + "grad_norm": 1.414182186126709, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8785094022750854, + "num_tokens": 382876046.0, + "step": 10503 + }, + { + "epoch": 1.9506035283194056, + "grad_norm": 1.530029058456421, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8798882961273193, + "num_tokens": 382911251.0, + "step": 10504 + }, + { + "epoch": 1.9507892293407614, + "grad_norm": 1.555302619934082, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8644304275512695, + "num_tokens": 382947396.0, + "step": 10505 + }, + { + "epoch": 1.950974930362117, + "grad_norm": 1.683470368385315, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8698314428329468, + "num_tokens": 382979020.0, + "step": 10506 + }, + { + "epoch": 1.9511606313834726, + "grad_norm": 1.4774103164672852, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8759709596633911, + "num_tokens": 383017900.0, + "step": 10507 + }, + { + "epoch": 1.9513463324048281, + "grad_norm": 1.4824740886688232, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8799110651016235, + "num_tokens": 383057102.0, + "step": 10508 + }, + { + "epoch": 1.9515320334261839, + "grad_norm": 1.5130106210708618, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8753253221511841, + "num_tokens": 383095294.0, + "step": 10509 + }, + { + "epoch": 1.9517177344475396, + "grad_norm": 1.5094817876815796, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8766871690750122, + "num_tokens": 383136066.0, + "step": 10510 + }, + { + "epoch": 1.951903435468895, + "grad_norm": 1.5069752931594849, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8779612183570862, + "num_tokens": 383170614.0, + "step": 10511 + }, + { + "epoch": 1.9520891364902506, + "grad_norm": 1.4798614978790283, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8751376867294312, + "num_tokens": 383211631.0, + "step": 10512 + }, + { + "epoch": 1.9522748375116064, + "grad_norm": 1.5958093404769897, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8646193146705627, + "num_tokens": 383250143.0, + "step": 10513 + }, + { + "epoch": 1.952460538532962, + "grad_norm": 1.6099292039871216, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8643193244934082, + "num_tokens": 383286347.0, + "step": 10514 + }, + { + "epoch": 1.9526462395543176, + "grad_norm": 1.560817837715149, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8902252912521362, + "num_tokens": 383320504.0, + "step": 10515 + }, + { + "epoch": 1.952831940575673, + "grad_norm": 1.4639664888381958, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8706790208816528, + "num_tokens": 383358927.0, + "step": 10516 + }, + { + "epoch": 1.9530176415970288, + "grad_norm": 1.408103346824646, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8792515397071838, + "num_tokens": 383397901.0, + "step": 10517 + }, + { + "epoch": 1.9532033426183844, + "grad_norm": 1.5074431896209717, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8887444734573364, + "num_tokens": 383432834.0, + "step": 10518 + }, + { + "epoch": 1.9533890436397399, + "grad_norm": 1.47520911693573, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8681161403656006, + "num_tokens": 383473007.0, + "step": 10519 + }, + { + "epoch": 1.9535747446610956, + "grad_norm": 1.5078954696655273, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8829763531684875, + "num_tokens": 383508524.0, + "step": 10520 + }, + { + "epoch": 1.9537604456824513, + "grad_norm": 1.6959147453308105, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8764718770980835, + "num_tokens": 383539585.0, + "step": 10521 + }, + { + "epoch": 1.9539461467038068, + "grad_norm": 1.5642151832580566, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8863546252250671, + "num_tokens": 383574470.0, + "step": 10522 + }, + { + "epoch": 1.9541318477251624, + "grad_norm": 1.4618158340454102, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8910133242607117, + "num_tokens": 383610898.0, + "step": 10523 + }, + { + "epoch": 1.954317548746518, + "grad_norm": 1.3619089126586914, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8781512975692749, + "num_tokens": 383654345.0, + "step": 10524 + }, + { + "epoch": 1.9545032497678738, + "grad_norm": 1.5640907287597656, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8776763677597046, + "num_tokens": 383691196.0, + "step": 10525 + }, + { + "epoch": 1.9546889507892293, + "grad_norm": 1.6255638599395752, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8813520073890686, + "num_tokens": 383725215.0, + "step": 10526 + }, + { + "epoch": 1.9548746518105848, + "grad_norm": 1.4901392459869385, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8897475004196167, + "num_tokens": 383759208.0, + "step": 10527 + }, + { + "epoch": 1.9550603528319406, + "grad_norm": 1.6556726694107056, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.864199161529541, + "num_tokens": 383794252.0, + "step": 10528 + }, + { + "epoch": 1.9552460538532963, + "grad_norm": 1.692232370376587, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8736859560012817, + "num_tokens": 383827666.0, + "step": 10529 + }, + { + "epoch": 1.9554317548746518, + "grad_norm": 1.4217047691345215, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.87877357006073, + "num_tokens": 383870725.0, + "step": 10530 + }, + { + "epoch": 1.9556174558960073, + "grad_norm": 1.4580748081207275, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8765792846679688, + "num_tokens": 383909602.0, + "step": 10531 + }, + { + "epoch": 1.955803156917363, + "grad_norm": 1.5775322914123535, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8771079778671265, + "num_tokens": 383945618.0, + "step": 10532 + }, + { + "epoch": 1.9559888579387188, + "grad_norm": 1.3994659185409546, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8786395192146301, + "num_tokens": 383987521.0, + "step": 10533 + }, + { + "epoch": 1.9561745589600743, + "grad_norm": 1.4924355745315552, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8879969120025635, + "num_tokens": 384024320.0, + "step": 10534 + }, + { + "epoch": 1.9563602599814298, + "grad_norm": 1.547249436378479, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8851630687713623, + "num_tokens": 384060587.0, + "step": 10535 + }, + { + "epoch": 1.9565459610027855, + "grad_norm": 1.5624228715896606, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8728815317153931, + "num_tokens": 384096704.0, + "step": 10536 + }, + { + "epoch": 1.9567316620241413, + "grad_norm": 1.5638442039489746, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8787460327148438, + "num_tokens": 384129419.0, + "step": 10537 + }, + { + "epoch": 1.9569173630454968, + "grad_norm": 1.5197787284851074, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8773231506347656, + "num_tokens": 384167010.0, + "step": 10538 + }, + { + "epoch": 1.9571030640668523, + "grad_norm": 1.411332130432129, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.876246988773346, + "num_tokens": 384207963.0, + "step": 10539 + }, + { + "epoch": 1.957288765088208, + "grad_norm": 1.4945183992385864, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8826102018356323, + "num_tokens": 384243546.0, + "step": 10540 + }, + { + "epoch": 1.9574744661095635, + "grad_norm": 1.6344306468963623, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8902412056922913, + "num_tokens": 384273487.0, + "step": 10541 + }, + { + "epoch": 1.957660167130919, + "grad_norm": 1.501810908317566, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8845310211181641, + "num_tokens": 384310885.0, + "step": 10542 + }, + { + "epoch": 1.9578458681522748, + "grad_norm": 1.529958724975586, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8667057752609253, + "num_tokens": 384352671.0, + "step": 10543 + }, + { + "epoch": 1.9580315691736305, + "grad_norm": 1.4209703207015991, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8892612457275391, + "num_tokens": 384391131.0, + "step": 10544 + }, + { + "epoch": 1.958217270194986, + "grad_norm": 1.4851588010787964, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8880360126495361, + "num_tokens": 384427925.0, + "step": 10545 + }, + { + "epoch": 1.9584029712163415, + "grad_norm": 1.5423243045806885, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8785619139671326, + "num_tokens": 384461952.0, + "step": 10546 + }, + { + "epoch": 1.9585886722376973, + "grad_norm": 1.604471206665039, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8798611760139465, + "num_tokens": 384495991.0, + "step": 10547 + }, + { + "epoch": 1.958774373259053, + "grad_norm": 1.6626092195510864, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8834902048110962, + "num_tokens": 384525749.0, + "step": 10548 + }, + { + "epoch": 1.9589600742804085, + "grad_norm": 1.5335578918457031, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8868736028671265, + "num_tokens": 384558413.0, + "step": 10549 + }, + { + "epoch": 1.959145775301764, + "grad_norm": 1.4668909311294556, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8934022784233093, + "num_tokens": 384594806.0, + "step": 10550 + }, + { + "epoch": 1.9593314763231198, + "grad_norm": 1.690581202507019, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8651422262191772, + "num_tokens": 384628217.0, + "step": 10551 + }, + { + "epoch": 1.9595171773444755, + "grad_norm": 1.4244277477264404, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8902647495269775, + "num_tokens": 384663460.0, + "step": 10552 + }, + { + "epoch": 1.959702878365831, + "grad_norm": 1.4338574409484863, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8781507611274719, + "num_tokens": 384705021.0, + "step": 10553 + }, + { + "epoch": 1.9598885793871865, + "grad_norm": 1.6341243982315063, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8784158825874329, + "num_tokens": 384739497.0, + "step": 10554 + }, + { + "epoch": 1.9600742804085423, + "grad_norm": 1.481776237487793, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8798120021820068, + "num_tokens": 384777279.0, + "step": 10555 + }, + { + "epoch": 1.960259981429898, + "grad_norm": 1.41569185256958, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8646349906921387, + "num_tokens": 384822114.0, + "step": 10556 + }, + { + "epoch": 1.9604456824512535, + "grad_norm": 1.5271222591400146, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8711202144622803, + "num_tokens": 384859107.0, + "step": 10557 + }, + { + "epoch": 1.960631383472609, + "grad_norm": 1.5489846467971802, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8680798411369324, + "num_tokens": 384895361.0, + "step": 10558 + }, + { + "epoch": 1.9608170844939647, + "grad_norm": 1.5602829456329346, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8744646906852722, + "num_tokens": 384931912.0, + "step": 10559 + }, + { + "epoch": 1.9610027855153205, + "grad_norm": 1.546669363975525, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.876990795135498, + "num_tokens": 384967495.0, + "step": 10560 + }, + { + "epoch": 1.961188486536676, + "grad_norm": 1.6334768533706665, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8757639527320862, + "num_tokens": 385000555.0, + "step": 10561 + }, + { + "epoch": 1.9613741875580315, + "grad_norm": 1.5611988306045532, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8627304434776306, + "num_tokens": 385037665.0, + "step": 10562 + }, + { + "epoch": 1.9615598885793872, + "grad_norm": 1.3850646018981934, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8842353820800781, + "num_tokens": 385079629.0, + "step": 10563 + }, + { + "epoch": 1.961745589600743, + "grad_norm": 1.4233413934707642, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8809270262718201, + "num_tokens": 385119285.0, + "step": 10564 + }, + { + "epoch": 1.9619312906220983, + "grad_norm": 1.5140095949172974, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8822476863861084, + "num_tokens": 385162592.0, + "step": 10565 + }, + { + "epoch": 1.962116991643454, + "grad_norm": 1.4899965524673462, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8893080949783325, + "num_tokens": 385198714.0, + "step": 10566 + }, + { + "epoch": 1.9623026926648097, + "grad_norm": 1.4967374801635742, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8766688108444214, + "num_tokens": 385235887.0, + "step": 10567 + }, + { + "epoch": 1.9624883936861652, + "grad_norm": 1.811013102531433, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8716375827789307, + "num_tokens": 385264442.0, + "step": 10568 + }, + { + "epoch": 1.9626740947075207, + "grad_norm": 1.4677953720092773, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8758845329284668, + "num_tokens": 385301537.0, + "step": 10569 + }, + { + "epoch": 1.9628597957288765, + "grad_norm": 1.5171302556991577, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8765153288841248, + "num_tokens": 385337526.0, + "step": 10570 + }, + { + "epoch": 1.9630454967502322, + "grad_norm": 1.3264484405517578, + "learning_rate": 1e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.9000667333602905, + "num_tokens": 385382137.0, + "step": 10571 + }, + { + "epoch": 1.9632311977715877, + "grad_norm": 1.5397024154663086, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8681015968322754, + "num_tokens": 385420100.0, + "step": 10572 + }, + { + "epoch": 1.9634168987929432, + "grad_norm": 1.4750570058822632, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8851113319396973, + "num_tokens": 385455407.0, + "step": 10573 + }, + { + "epoch": 1.963602599814299, + "grad_norm": 1.6863000392913818, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8790234923362732, + "num_tokens": 385487372.0, + "step": 10574 + }, + { + "epoch": 1.9637883008356547, + "grad_norm": 1.5702461004257202, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8743160963058472, + "num_tokens": 385525787.0, + "step": 10575 + }, + { + "epoch": 1.9639740018570102, + "grad_norm": 1.3524290323257446, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8900301456451416, + "num_tokens": 385566460.0, + "step": 10576 + }, + { + "epoch": 1.9641597028783657, + "grad_norm": 1.6325095891952515, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8770979046821594, + "num_tokens": 385601359.0, + "step": 10577 + }, + { + "epoch": 1.9643454038997215, + "grad_norm": 1.457674503326416, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8812774419784546, + "num_tokens": 385637354.0, + "step": 10578 + }, + { + "epoch": 1.9645311049210772, + "grad_norm": 1.5843456983566284, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8793301582336426, + "num_tokens": 385670495.0, + "step": 10579 + }, + { + "epoch": 1.9647168059424327, + "grad_norm": 1.4609909057617188, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8825944662094116, + "num_tokens": 385708005.0, + "step": 10580 + }, + { + "epoch": 1.9649025069637882, + "grad_norm": 1.639020323753357, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8821339011192322, + "num_tokens": 385747574.0, + "step": 10581 + }, + { + "epoch": 1.965088207985144, + "grad_norm": 1.4736295938491821, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8860584497451782, + "num_tokens": 385785024.0, + "step": 10582 + }, + { + "epoch": 1.9652739090064997, + "grad_norm": 1.6074155569076538, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8649455308914185, + "num_tokens": 385823028.0, + "step": 10583 + }, + { + "epoch": 1.9654596100278552, + "grad_norm": 1.619911551475525, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8559771180152893, + "num_tokens": 385859212.0, + "step": 10584 + }, + { + "epoch": 1.9656453110492107, + "grad_norm": 1.577567458152771, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8823421597480774, + "num_tokens": 385894097.0, + "step": 10585 + }, + { + "epoch": 1.9658310120705664, + "grad_norm": 1.4596484899520874, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8700075745582581, + "num_tokens": 385935457.0, + "step": 10586 + }, + { + "epoch": 1.9660167130919222, + "grad_norm": 1.7970772981643677, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8669182062149048, + "num_tokens": 385967561.0, + "step": 10587 + }, + { + "epoch": 1.9662024141132777, + "grad_norm": 1.5544358491897583, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8759100437164307, + "num_tokens": 386004508.0, + "step": 10588 + }, + { + "epoch": 1.9663881151346332, + "grad_norm": 1.5557868480682373, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8831031322479248, + "num_tokens": 386040252.0, + "step": 10589 + }, + { + "epoch": 1.966573816155989, + "grad_norm": 1.6009467840194702, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8763364553451538, + "num_tokens": 386074917.0, + "step": 10590 + }, + { + "epoch": 1.9667595171773444, + "grad_norm": 1.554482102394104, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8744595050811768, + "num_tokens": 386116054.0, + "step": 10591 + }, + { + "epoch": 1.9669452181987, + "grad_norm": 1.5455963611602783, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8764646053314209, + "num_tokens": 386153427.0, + "step": 10592 + }, + { + "epoch": 1.9671309192200557, + "grad_norm": 1.497493028640747, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8755894899368286, + "num_tokens": 386191248.0, + "step": 10593 + }, + { + "epoch": 1.9673166202414114, + "grad_norm": 1.4158960580825806, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8813377618789673, + "num_tokens": 386230244.0, + "step": 10594 + }, + { + "epoch": 1.967502321262767, + "grad_norm": 1.4723438024520874, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8793129920959473, + "num_tokens": 386264575.0, + "step": 10595 + }, + { + "epoch": 1.9676880222841224, + "grad_norm": 1.5560393333435059, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8790360689163208, + "num_tokens": 386300615.0, + "step": 10596 + }, + { + "epoch": 1.9678737233054782, + "grad_norm": 1.6045328378677368, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8786848783493042, + "num_tokens": 386339835.0, + "step": 10597 + }, + { + "epoch": 1.968059424326834, + "grad_norm": 1.5134750604629517, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8839491605758667, + "num_tokens": 386376866.0, + "step": 10598 + }, + { + "epoch": 1.9682451253481894, + "grad_norm": 1.584938645362854, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8609665036201477, + "num_tokens": 386415789.0, + "step": 10599 + }, + { + "epoch": 1.968430826369545, + "grad_norm": 1.762511968612671, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8768244981765747, + "num_tokens": 386445367.0, + "step": 10600 + }, + { + "epoch": 1.9686165273909007, + "grad_norm": 1.543997883796692, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.878034234046936, + "num_tokens": 386483476.0, + "step": 10601 + }, + { + "epoch": 1.9688022284122564, + "grad_norm": 1.3990532159805298, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8866687417030334, + "num_tokens": 386521993.0, + "step": 10602 + }, + { + "epoch": 1.968987929433612, + "grad_norm": 1.4512581825256348, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8806492686271667, + "num_tokens": 386563373.0, + "step": 10603 + }, + { + "epoch": 1.9691736304549674, + "grad_norm": 1.4370683431625366, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8881290555000305, + "num_tokens": 386603446.0, + "step": 10604 + }, + { + "epoch": 1.9693593314763231, + "grad_norm": 1.4132782220840454, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8864635229110718, + "num_tokens": 386641439.0, + "step": 10605 + }, + { + "epoch": 1.9695450324976789, + "grad_norm": 1.5962977409362793, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.868756890296936, + "num_tokens": 386677087.0, + "step": 10606 + }, + { + "epoch": 1.9697307335190344, + "grad_norm": 1.506905198097229, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8768694400787354, + "num_tokens": 386716695.0, + "step": 10607 + }, + { + "epoch": 1.96991643454039, + "grad_norm": 1.4214844703674316, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8805406093597412, + "num_tokens": 386757844.0, + "step": 10608 + }, + { + "epoch": 1.9701021355617456, + "grad_norm": 1.5614746809005737, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8820698261260986, + "num_tokens": 386789858.0, + "step": 10609 + }, + { + "epoch": 1.9702878365831014, + "grad_norm": 1.6847314834594727, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8762922286987305, + "num_tokens": 386822719.0, + "step": 10610 + }, + { + "epoch": 1.9704735376044569, + "grad_norm": 1.4694328308105469, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.884576141834259, + "num_tokens": 386860171.0, + "step": 10611 + }, + { + "epoch": 1.9706592386258124, + "grad_norm": 1.6257147789001465, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8765522241592407, + "num_tokens": 386896199.0, + "step": 10612 + }, + { + "epoch": 1.9708449396471681, + "grad_norm": 1.5095282793045044, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8835080862045288, + "num_tokens": 386931412.0, + "step": 10613 + }, + { + "epoch": 1.9710306406685236, + "grad_norm": 1.550986886024475, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.869632363319397, + "num_tokens": 386968917.0, + "step": 10614 + }, + { + "epoch": 1.9712163416898791, + "grad_norm": 1.8176367282867432, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8745876550674438, + "num_tokens": 387000517.0, + "step": 10615 + }, + { + "epoch": 1.9714020427112349, + "grad_norm": 1.4061496257781982, + "learning_rate": 1e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.8961474895477295, + "num_tokens": 387038304.0, + "step": 10616 + }, + { + "epoch": 1.9715877437325906, + "grad_norm": 1.6258375644683838, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8729503750801086, + "num_tokens": 387070679.0, + "step": 10617 + }, + { + "epoch": 1.9717734447539461, + "grad_norm": 1.5702540874481201, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8808391690254211, + "num_tokens": 387104479.0, + "step": 10618 + }, + { + "epoch": 1.9719591457753016, + "grad_norm": 1.3830771446228027, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8764909505844116, + "num_tokens": 387148995.0, + "step": 10619 + }, + { + "epoch": 1.9721448467966574, + "grad_norm": 1.5711891651153564, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8833134770393372, + "num_tokens": 387188763.0, + "step": 10620 + }, + { + "epoch": 1.972330547818013, + "grad_norm": 1.5193893909454346, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.878434956073761, + "num_tokens": 387224577.0, + "step": 10621 + }, + { + "epoch": 1.9725162488393686, + "grad_norm": 1.5190383195877075, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8918249011039734, + "num_tokens": 387263169.0, + "step": 10622 + }, + { + "epoch": 1.9727019498607241, + "grad_norm": 1.3680351972579956, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8868146538734436, + "num_tokens": 387305795.0, + "step": 10623 + }, + { + "epoch": 1.9728876508820798, + "grad_norm": 1.3879666328430176, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8887395262718201, + "num_tokens": 387346275.0, + "step": 10624 + }, + { + "epoch": 1.9730733519034356, + "grad_norm": 1.5785560607910156, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8748509287834167, + "num_tokens": 387383861.0, + "step": 10625 + }, + { + "epoch": 1.973259052924791, + "grad_norm": 1.4808825254440308, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8791757822036743, + "num_tokens": 387420722.0, + "step": 10626 + }, + { + "epoch": 1.9734447539461466, + "grad_norm": 1.7253212928771973, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8608946800231934, + "num_tokens": 387452754.0, + "step": 10627 + }, + { + "epoch": 1.9736304549675023, + "grad_norm": 1.4879975318908691, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8769572973251343, + "num_tokens": 387491660.0, + "step": 10628 + }, + { + "epoch": 1.973816155988858, + "grad_norm": 1.594175934791565, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8736568689346313, + "num_tokens": 387525390.0, + "step": 10629 + }, + { + "epoch": 1.9740018570102136, + "grad_norm": 1.6463781595230103, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8666911721229553, + "num_tokens": 387557804.0, + "step": 10630 + }, + { + "epoch": 1.974187558031569, + "grad_norm": 1.5738122463226318, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8776681423187256, + "num_tokens": 387591909.0, + "step": 10631 + }, + { + "epoch": 1.9743732590529248, + "grad_norm": 1.6272687911987305, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.870742917060852, + "num_tokens": 387623641.0, + "step": 10632 + }, + { + "epoch": 1.9745589600742806, + "grad_norm": 1.519062876701355, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8937515616416931, + "num_tokens": 387661003.0, + "step": 10633 + }, + { + "epoch": 1.974744661095636, + "grad_norm": 1.7008706331253052, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8695741295814514, + "num_tokens": 387696017.0, + "step": 10634 + }, + { + "epoch": 1.9749303621169916, + "grad_norm": 1.6696991920471191, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.882924497127533, + "num_tokens": 387726141.0, + "step": 10635 + }, + { + "epoch": 1.9751160631383473, + "grad_norm": 1.5522671937942505, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8756058812141418, + "num_tokens": 387761683.0, + "step": 10636 + }, + { + "epoch": 1.975301764159703, + "grad_norm": 1.4468793869018555, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8839800357818604, + "num_tokens": 387799234.0, + "step": 10637 + }, + { + "epoch": 1.9754874651810583, + "grad_norm": 1.5275325775146484, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8896010518074036, + "num_tokens": 387831580.0, + "step": 10638 + }, + { + "epoch": 1.975673166202414, + "grad_norm": 1.5800669193267822, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8777010440826416, + "num_tokens": 387868135.0, + "step": 10639 + }, + { + "epoch": 1.9758588672237698, + "grad_norm": 1.4264403581619263, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8681572675704956, + "num_tokens": 387909112.0, + "step": 10640 + }, + { + "epoch": 1.9760445682451253, + "grad_norm": 1.59628427028656, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8711333870887756, + "num_tokens": 387945261.0, + "step": 10641 + }, + { + "epoch": 1.9762302692664808, + "grad_norm": 1.6121450662612915, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8755557537078857, + "num_tokens": 387977859.0, + "step": 10642 + }, + { + "epoch": 1.9764159702878366, + "grad_norm": 1.4003154039382935, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8825924396514893, + "num_tokens": 388019612.0, + "step": 10643 + }, + { + "epoch": 1.9766016713091923, + "grad_norm": 1.6969300508499146, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8836507201194763, + "num_tokens": 388048114.0, + "step": 10644 + }, + { + "epoch": 1.9767873723305478, + "grad_norm": 1.7756463289260864, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8786652684211731, + "num_tokens": 388079809.0, + "step": 10645 + }, + { + "epoch": 1.9769730733519033, + "grad_norm": 1.6886367797851562, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8843173980712891, + "num_tokens": 388109569.0, + "step": 10646 + }, + { + "epoch": 1.977158774373259, + "grad_norm": 1.5149940252304077, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8678380846977234, + "num_tokens": 388147718.0, + "step": 10647 + }, + { + "epoch": 1.9773444753946148, + "grad_norm": 1.5392286777496338, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8758349418640137, + "num_tokens": 388183068.0, + "step": 10648 + }, + { + "epoch": 1.9775301764159703, + "grad_norm": 1.4459824562072754, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8816576600074768, + "num_tokens": 388221876.0, + "step": 10649 + }, + { + "epoch": 1.9777158774373258, + "grad_norm": 1.5413492918014526, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8932188749313354, + "num_tokens": 388254238.0, + "step": 10650 + }, + { + "epoch": 1.9779015784586815, + "grad_norm": 1.389114260673523, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8807927370071411, + "num_tokens": 388293614.0, + "step": 10651 + }, + { + "epoch": 1.9780872794800373, + "grad_norm": 1.6192377805709839, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8894885778427124, + "num_tokens": 388325193.0, + "step": 10652 + }, + { + "epoch": 1.9782729805013928, + "grad_norm": 1.7774684429168701, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8749046325683594, + "num_tokens": 388354571.0, + "step": 10653 + }, + { + "epoch": 1.9784586815227483, + "grad_norm": 1.3812733888626099, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.8949665427207947, + "num_tokens": 388395036.0, + "step": 10654 + }, + { + "epoch": 1.978644382544104, + "grad_norm": 1.4363104104995728, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8733701705932617, + "num_tokens": 388434526.0, + "step": 10655 + }, + { + "epoch": 1.9788300835654598, + "grad_norm": 1.587828516960144, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8924555778503418, + "num_tokens": 388466077.0, + "step": 10656 + }, + { + "epoch": 1.9790157845868153, + "grad_norm": 1.4260207414627075, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8822543621063232, + "num_tokens": 388508187.0, + "step": 10657 + }, + { + "epoch": 1.9792014856081708, + "grad_norm": 1.685992956161499, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8861620426177979, + "num_tokens": 388538963.0, + "step": 10658 + }, + { + "epoch": 1.9793871866295265, + "grad_norm": 1.5793325901031494, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.8915389180183411, + "num_tokens": 388573183.0, + "step": 10659 + }, + { + "epoch": 1.9795728876508822, + "grad_norm": 1.4838263988494873, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8752317428588867, + "num_tokens": 388611906.0, + "step": 10660 + }, + { + "epoch": 1.9797585886722378, + "grad_norm": 1.5614384412765503, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8714369535446167, + "num_tokens": 388648844.0, + "step": 10661 + }, + { + "epoch": 1.9799442896935933, + "grad_norm": 1.6939013004302979, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8855870962142944, + "num_tokens": 388676940.0, + "step": 10662 + }, + { + "epoch": 1.980129990714949, + "grad_norm": 1.6529897451400757, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8747931718826294, + "num_tokens": 388707485.0, + "step": 10663 + }, + { + "epoch": 1.9803156917363045, + "grad_norm": 1.6351560354232788, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8640528321266174, + "num_tokens": 388744272.0, + "step": 10664 + }, + { + "epoch": 1.98050139275766, + "grad_norm": 1.7373771667480469, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8680406808853149, + "num_tokens": 388777303.0, + "step": 10665 + }, + { + "epoch": 1.9806870937790158, + "grad_norm": 1.8274097442626953, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8616026639938354, + "num_tokens": 388809841.0, + "step": 10666 + }, + { + "epoch": 1.9808727948003715, + "grad_norm": 1.3692015409469604, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8908955454826355, + "num_tokens": 388852304.0, + "step": 10667 + }, + { + "epoch": 1.981058495821727, + "grad_norm": 1.5946803092956543, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.870229959487915, + "num_tokens": 388889688.0, + "step": 10668 + }, + { + "epoch": 1.9812441968430825, + "grad_norm": 1.6119855642318726, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8685747385025024, + "num_tokens": 388926099.0, + "step": 10669 + }, + { + "epoch": 1.9814298978644382, + "grad_norm": 1.673113226890564, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8772765398025513, + "num_tokens": 388957973.0, + "step": 10670 + }, + { + "epoch": 1.981615598885794, + "grad_norm": 1.5593050718307495, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8953144550323486, + "num_tokens": 388991076.0, + "step": 10671 + }, + { + "epoch": 1.9818012999071495, + "grad_norm": 1.6217601299285889, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8804351091384888, + "num_tokens": 389024342.0, + "step": 10672 + }, + { + "epoch": 1.981987000928505, + "grad_norm": 1.5143184661865234, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8780992031097412, + "num_tokens": 389061625.0, + "step": 10673 + }, + { + "epoch": 1.9821727019498607, + "grad_norm": 1.5371062755584717, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8779239058494568, + "num_tokens": 389098817.0, + "step": 10674 + }, + { + "epoch": 1.9823584029712165, + "grad_norm": 1.706236720085144, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8547190427780151, + "num_tokens": 389132934.0, + "step": 10675 + }, + { + "epoch": 1.982544103992572, + "grad_norm": 1.4749078750610352, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8843433856964111, + "num_tokens": 389172332.0, + "step": 10676 + }, + { + "epoch": 1.9827298050139275, + "grad_norm": 1.5166347026824951, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8560411334037781, + "num_tokens": 389212881.0, + "step": 10677 + }, + { + "epoch": 1.9829155060352832, + "grad_norm": 1.4764525890350342, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8802076578140259, + "num_tokens": 389251863.0, + "step": 10678 + }, + { + "epoch": 1.983101207056639, + "grad_norm": 1.4956486225128174, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8861849904060364, + "num_tokens": 389287593.0, + "step": 10679 + }, + { + "epoch": 1.9832869080779945, + "grad_norm": 1.5944554805755615, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8722296953201294, + "num_tokens": 389323079.0, + "step": 10680 + }, + { + "epoch": 1.98347260909935, + "grad_norm": 1.4481691122055054, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8727754354476929, + "num_tokens": 389361268.0, + "step": 10681 + }, + { + "epoch": 1.9836583101207057, + "grad_norm": 1.5966004133224487, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8681638836860657, + "num_tokens": 389399432.0, + "step": 10682 + }, + { + "epoch": 1.9838440111420614, + "grad_norm": 1.5494247674942017, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8745486736297607, + "num_tokens": 389437557.0, + "step": 10683 + }, + { + "epoch": 1.984029712163417, + "grad_norm": 1.6237188577651978, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8725818395614624, + "num_tokens": 389471950.0, + "step": 10684 + }, + { + "epoch": 1.9842154131847725, + "grad_norm": 1.6471772193908691, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.885442852973938, + "num_tokens": 389504196.0, + "step": 10685 + }, + { + "epoch": 1.9844011142061282, + "grad_norm": 1.5668050050735474, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8673069477081299, + "num_tokens": 389541322.0, + "step": 10686 + }, + { + "epoch": 1.9845868152274837, + "grad_norm": 1.407271385192871, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8841159343719482, + "num_tokens": 389579689.0, + "step": 10687 + }, + { + "epoch": 1.9847725162488392, + "grad_norm": 1.5932565927505493, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.882021963596344, + "num_tokens": 389618858.0, + "step": 10688 + }, + { + "epoch": 1.984958217270195, + "grad_norm": 1.527366042137146, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8843870162963867, + "num_tokens": 389656155.0, + "step": 10689 + }, + { + "epoch": 1.9851439182915507, + "grad_norm": 1.6264957189559937, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8642992377281189, + "num_tokens": 389692367.0, + "step": 10690 + }, + { + "epoch": 1.9853296193129062, + "grad_norm": 1.5241639614105225, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8697142004966736, + "num_tokens": 389730030.0, + "step": 10691 + }, + { + "epoch": 1.9855153203342617, + "grad_norm": 1.570200800895691, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8739486336708069, + "num_tokens": 389766331.0, + "step": 10692 + }, + { + "epoch": 1.9857010213556174, + "grad_norm": 1.635145664215088, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8769516944885254, + "num_tokens": 389801114.0, + "step": 10693 + }, + { + "epoch": 1.9858867223769732, + "grad_norm": 1.5960757732391357, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8820793628692627, + "num_tokens": 389833224.0, + "step": 10694 + }, + { + "epoch": 1.9860724233983287, + "grad_norm": 1.4543136358261108, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8727892637252808, + "num_tokens": 389872820.0, + "step": 10695 + }, + { + "epoch": 1.9862581244196842, + "grad_norm": 1.4800198078155518, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8805071115493774, + "num_tokens": 389909655.0, + "step": 10696 + }, + { + "epoch": 1.98644382544104, + "grad_norm": 1.6153849363327026, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8789978623390198, + "num_tokens": 389940878.0, + "step": 10697 + }, + { + "epoch": 1.9866295264623957, + "grad_norm": 1.4611860513687134, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8833270072937012, + "num_tokens": 389979514.0, + "step": 10698 + }, + { + "epoch": 1.9868152274837512, + "grad_norm": 1.7252748012542725, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8797358274459839, + "num_tokens": 390012987.0, + "step": 10699 + }, + { + "epoch": 1.9870009285051067, + "grad_norm": 1.4778169393539429, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8897683620452881, + "num_tokens": 390050140.0, + "step": 10700 + }, + { + "epoch": 1.9871866295264624, + "grad_norm": 1.438281536102295, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8874034285545349, + "num_tokens": 390088332.0, + "step": 10701 + }, + { + "epoch": 1.9873723305478181, + "grad_norm": 1.5009419918060303, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8769401907920837, + "num_tokens": 390124937.0, + "step": 10702 + }, + { + "epoch": 1.9875580315691737, + "grad_norm": 1.4279108047485352, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8738197088241577, + "num_tokens": 390162841.0, + "step": 10703 + }, + { + "epoch": 1.9877437325905292, + "grad_norm": 1.3112643957138062, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8918024301528931, + "num_tokens": 390206953.0, + "step": 10704 + }, + { + "epoch": 1.987929433611885, + "grad_norm": 1.6726744174957275, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8834301829338074, + "num_tokens": 390236633.0, + "step": 10705 + }, + { + "epoch": 1.9881151346332406, + "grad_norm": 1.5326200723648071, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8627247214317322, + "num_tokens": 390273924.0, + "step": 10706 + }, + { + "epoch": 1.9883008356545961, + "grad_norm": 1.4874601364135742, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8694972395896912, + "num_tokens": 390313813.0, + "step": 10707 + }, + { + "epoch": 1.9884865366759517, + "grad_norm": 1.5778378248214722, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8818132877349854, + "num_tokens": 390347383.0, + "step": 10708 + }, + { + "epoch": 1.9886722376973074, + "grad_norm": 1.4557039737701416, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8916750550270081, + "num_tokens": 390381569.0, + "step": 10709 + }, + { + "epoch": 1.988857938718663, + "grad_norm": 1.5442190170288086, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8759037256240845, + "num_tokens": 390418772.0, + "step": 10710 + }, + { + "epoch": 1.9890436397400184, + "grad_norm": 1.5368461608886719, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8780681490898132, + "num_tokens": 390452993.0, + "step": 10711 + }, + { + "epoch": 1.9892293407613741, + "grad_norm": 1.6287003755569458, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8568054437637329, + "num_tokens": 390491670.0, + "step": 10712 + }, + { + "epoch": 1.9894150417827299, + "grad_norm": 1.3963546752929688, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8903381824493408, + "num_tokens": 390532564.0, + "step": 10713 + }, + { + "epoch": 1.9896007428040854, + "grad_norm": 1.5971204042434692, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8810005187988281, + "num_tokens": 390565752.0, + "step": 10714 + }, + { + "epoch": 1.989786443825441, + "grad_norm": 1.6110590696334839, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8644629120826721, + "num_tokens": 390600492.0, + "step": 10715 + }, + { + "epoch": 1.9899721448467966, + "grad_norm": 1.4003756046295166, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8749692440032959, + "num_tokens": 390641855.0, + "step": 10716 + }, + { + "epoch": 1.9901578458681524, + "grad_norm": 1.4762849807739258, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8893846273422241, + "num_tokens": 390677744.0, + "step": 10717 + }, + { + "epoch": 1.9903435468895079, + "grad_norm": 1.4541590213775635, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8717875480651855, + "num_tokens": 390717352.0, + "step": 10718 + }, + { + "epoch": 1.9905292479108634, + "grad_norm": 1.7287808656692505, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.874943733215332, + "num_tokens": 390747309.0, + "step": 10719 + }, + { + "epoch": 1.9907149489322191, + "grad_norm": 1.6919357776641846, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8624929189682007, + "num_tokens": 390781263.0, + "step": 10720 + }, + { + "epoch": 1.9909006499535749, + "grad_norm": 1.5266231298446655, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8785523772239685, + "num_tokens": 390819293.0, + "step": 10721 + }, + { + "epoch": 1.9910863509749304, + "grad_norm": 1.4799400568008423, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8722749948501587, + "num_tokens": 390857975.0, + "step": 10722 + }, + { + "epoch": 1.9912720519962859, + "grad_norm": 1.4996455907821655, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8886063098907471, + "num_tokens": 390896548.0, + "step": 10723 + }, + { + "epoch": 1.9914577530176416, + "grad_norm": 1.6152703762054443, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8726261258125305, + "num_tokens": 390930212.0, + "step": 10724 + }, + { + "epoch": 1.9916434540389973, + "grad_norm": 1.476450800895691, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8778225183486938, + "num_tokens": 390967387.0, + "step": 10725 + }, + { + "epoch": 1.9918291550603529, + "grad_norm": 1.4866713285446167, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.881623387336731, + "num_tokens": 391004273.0, + "step": 10726 + }, + { + "epoch": 1.9920148560817084, + "grad_norm": 1.557522177696228, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8681043386459351, + "num_tokens": 391041251.0, + "step": 10727 + }, + { + "epoch": 1.992200557103064, + "grad_norm": 1.5317802429199219, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8803513050079346, + "num_tokens": 391076768.0, + "step": 10728 + }, + { + "epoch": 1.9923862581244198, + "grad_norm": 1.423466682434082, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8812741041183472, + "num_tokens": 391115121.0, + "step": 10729 + }, + { + "epoch": 1.9925719591457753, + "grad_norm": 1.4195533990859985, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8883833289146423, + "num_tokens": 391153052.0, + "step": 10730 + }, + { + "epoch": 1.9927576601671309, + "grad_norm": 1.5689467191696167, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8669784665107727, + "num_tokens": 391188485.0, + "step": 10731 + }, + { + "epoch": 1.9929433611884866, + "grad_norm": 1.5262104272842407, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8908077478408813, + "num_tokens": 391222700.0, + "step": 10732 + }, + { + "epoch": 1.9931290622098423, + "grad_norm": 1.4800670146942139, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8759070634841919, + "num_tokens": 391260668.0, + "step": 10733 + }, + { + "epoch": 1.9933147632311976, + "grad_norm": 1.6875883340835571, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8709208965301514, + "num_tokens": 391295249.0, + "step": 10734 + }, + { + "epoch": 1.9935004642525533, + "grad_norm": 1.4495477676391602, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8786629438400269, + "num_tokens": 391334997.0, + "step": 10735 + }, + { + "epoch": 1.993686165273909, + "grad_norm": 1.4227471351623535, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.893241822719574, + "num_tokens": 391370269.0, + "step": 10736 + }, + { + "epoch": 1.9938718662952646, + "grad_norm": 1.4554699659347534, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.885714054107666, + "num_tokens": 391403258.0, + "step": 10737 + }, + { + "epoch": 1.99405756731662, + "grad_norm": 1.4790608882904053, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.877698540687561, + "num_tokens": 391443384.0, + "step": 10738 + }, + { + "epoch": 1.9942432683379758, + "grad_norm": 1.6248427629470825, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8752295970916748, + "num_tokens": 391476712.0, + "step": 10739 + }, + { + "epoch": 1.9944289693593316, + "grad_norm": 1.3891186714172363, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.878501296043396, + "num_tokens": 391521060.0, + "step": 10740 + }, + { + "epoch": 1.994614670380687, + "grad_norm": 1.273025631904602, + "learning_rate": 1e-06, + "loss": 0.2605, + "mean_token_accuracy": 0.9080898761749268, + "num_tokens": 391562532.0, + "step": 10741 + }, + { + "epoch": 1.9948003714020426, + "grad_norm": 1.4954214096069336, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8689733743667603, + "num_tokens": 391602044.0, + "step": 10742 + }, + { + "epoch": 1.9949860724233983, + "grad_norm": 1.4606603384017944, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8712389469146729, + "num_tokens": 391641883.0, + "step": 10743 + }, + { + "epoch": 1.995171773444754, + "grad_norm": 1.6483204364776611, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8680993318557739, + "num_tokens": 391674418.0, + "step": 10744 + }, + { + "epoch": 1.9953574744661096, + "grad_norm": 1.5449333190917969, + "learning_rate": 1e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.8942645788192749, + "num_tokens": 391706329.0, + "step": 10745 + }, + { + "epoch": 1.995543175487465, + "grad_norm": 1.3677371740341187, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8815730214118958, + "num_tokens": 391749359.0, + "step": 10746 + }, + { + "epoch": 1.9957288765088208, + "grad_norm": 1.5534635782241821, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8718174695968628, + "num_tokens": 391783340.0, + "step": 10747 + }, + { + "epoch": 1.9959145775301765, + "grad_norm": 1.5871036052703857, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8763208389282227, + "num_tokens": 391818124.0, + "step": 10748 + }, + { + "epoch": 1.996100278551532, + "grad_norm": 1.6162004470825195, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8665463924407959, + "num_tokens": 391852863.0, + "step": 10749 + }, + { + "epoch": 1.9962859795728876, + "grad_norm": 1.4809426069259644, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8623752593994141, + "num_tokens": 391896031.0, + "step": 10750 + }, + { + "epoch": 1.9964716805942433, + "grad_norm": 1.6063166856765747, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8800040483474731, + "num_tokens": 391928459.0, + "step": 10751 + }, + { + "epoch": 1.996657381615599, + "grad_norm": 1.5798946619033813, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8825358152389526, + "num_tokens": 391962070.0, + "step": 10752 + }, + { + "epoch": 1.9968430826369545, + "grad_norm": 1.7162963151931763, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8763654232025146, + "num_tokens": 391997907.0, + "step": 10753 + }, + { + "epoch": 1.99702878365831, + "grad_norm": 1.3976763486862183, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8848652839660645, + "num_tokens": 392038649.0, + "step": 10754 + }, + { + "epoch": 1.9972144846796658, + "grad_norm": 1.4958575963974, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8910050392150879, + "num_tokens": 392071275.0, + "step": 10755 + }, + { + "epoch": 1.9974001857010215, + "grad_norm": 1.506563425064087, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8565182685852051, + "num_tokens": 392114493.0, + "step": 10756 + }, + { + "epoch": 1.997585886722377, + "grad_norm": 1.5192334651947021, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8818768262863159, + "num_tokens": 392149343.0, + "step": 10757 + }, + { + "epoch": 1.9977715877437325, + "grad_norm": 1.4888601303100586, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8889641761779785, + "num_tokens": 392190062.0, + "step": 10758 + }, + { + "epoch": 1.9979572887650883, + "grad_norm": 1.6409680843353271, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8810362815856934, + "num_tokens": 392224730.0, + "step": 10759 + }, + { + "epoch": 1.9981429897864438, + "grad_norm": 1.6199272871017456, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8850436210632324, + "num_tokens": 392257716.0, + "step": 10760 + }, + { + "epoch": 1.9983286908077993, + "grad_norm": 1.545701265335083, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8743795156478882, + "num_tokens": 392294793.0, + "step": 10761 + }, + { + "epoch": 1.998514391829155, + "grad_norm": 1.647356390953064, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8940615653991699, + "num_tokens": 392325277.0, + "step": 10762 + }, + { + "epoch": 1.9987000928505108, + "grad_norm": 1.4501545429229736, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8898797631263733, + "num_tokens": 392359133.0, + "step": 10763 + }, + { + "epoch": 1.9988857938718663, + "grad_norm": 1.5960125923156738, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8715622425079346, + "num_tokens": 392392672.0, + "step": 10764 + }, + { + "epoch": 1.9990714948932218, + "grad_norm": 1.7567051649093628, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8680539131164551, + "num_tokens": 392423862.0, + "step": 10765 + }, + { + "epoch": 1.9992571959145775, + "grad_norm": 1.4944804906845093, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8716146945953369, + "num_tokens": 392462447.0, + "step": 10766 + }, + { + "epoch": 1.9994428969359332, + "grad_norm": 1.5269140005111694, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8844513893127441, + "num_tokens": 392497626.0, + "step": 10767 + }, + { + "epoch": 1.9996285979572888, + "grad_norm": 1.535066843032837, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8705165386199951, + "num_tokens": 392532002.0, + "step": 10768 + }, + { + "epoch": 1.9998142989786443, + "grad_norm": 1.5921173095703125, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8844282627105713, + "num_tokens": 392564213.0, + "step": 10769 + }, + { + "epoch": 2.0, + "grad_norm": 1.6401822566986084, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8780713081359863, + "num_tokens": 392597395.0, + "step": 10770 + }, + { + "epoch": 2.0001857010213557, + "grad_norm": 1.4931284189224243, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8892332911491394, + "num_tokens": 392634857.0, + "step": 10771 + }, + { + "epoch": 2.000371402042711, + "grad_norm": 1.4543534517288208, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8885159492492676, + "num_tokens": 392673582.0, + "step": 10772 + }, + { + "epoch": 2.0005571030640668, + "grad_norm": 1.5108251571655273, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.9007832407951355, + "num_tokens": 392709140.0, + "step": 10773 + }, + { + "epoch": 2.0007428040854225, + "grad_norm": 1.4627865552902222, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8944653272628784, + "num_tokens": 392747641.0, + "step": 10774 + }, + { + "epoch": 2.000928505106778, + "grad_norm": 1.4838294982910156, + "learning_rate": 1e-06, + "loss": 0.2832, + "mean_token_accuracy": 0.8988832831382751, + "num_tokens": 392781244.0, + "step": 10775 + }, + { + "epoch": 2.0011142061281335, + "grad_norm": 1.4489752054214478, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8817600011825562, + "num_tokens": 392820143.0, + "step": 10776 + }, + { + "epoch": 2.0012999071494892, + "grad_norm": 1.5591390132904053, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8805333375930786, + "num_tokens": 392856719.0, + "step": 10777 + }, + { + "epoch": 2.001485608170845, + "grad_norm": 1.4654444456100464, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8668462634086609, + "num_tokens": 392900048.0, + "step": 10778 + }, + { + "epoch": 2.0016713091922007, + "grad_norm": 1.7013155221939087, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8859477043151855, + "num_tokens": 392933375.0, + "step": 10779 + }, + { + "epoch": 2.001857010213556, + "grad_norm": 1.464508056640625, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8919956684112549, + "num_tokens": 392973074.0, + "step": 10780 + }, + { + "epoch": 2.0020427112349117, + "grad_norm": 1.8304907083511353, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8783658146858215, + "num_tokens": 393004500.0, + "step": 10781 + }, + { + "epoch": 2.0022284122562675, + "grad_norm": 1.6368390321731567, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8859298229217529, + "num_tokens": 393040020.0, + "step": 10782 + }, + { + "epoch": 2.002414113277623, + "grad_norm": 1.6613433361053467, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8744319081306458, + "num_tokens": 393075646.0, + "step": 10783 + }, + { + "epoch": 2.0025998142989785, + "grad_norm": 1.4352054595947266, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8912561535835266, + "num_tokens": 393115905.0, + "step": 10784 + }, + { + "epoch": 2.002785515320334, + "grad_norm": 1.5145269632339478, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8900715708732605, + "num_tokens": 393155374.0, + "step": 10785 + }, + { + "epoch": 2.00297121634169, + "grad_norm": 1.6288845539093018, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8814961910247803, + "num_tokens": 393194563.0, + "step": 10786 + }, + { + "epoch": 2.0031569173630457, + "grad_norm": 1.6649409532546997, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8789616823196411, + "num_tokens": 393227758.0, + "step": 10787 + }, + { + "epoch": 2.003342618384401, + "grad_norm": 1.6997805833816528, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8868156671524048, + "num_tokens": 393258331.0, + "step": 10788 + }, + { + "epoch": 2.0035283194057567, + "grad_norm": 1.6780043840408325, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8959739208221436, + "num_tokens": 393294058.0, + "step": 10789 + }, + { + "epoch": 2.0037140204271124, + "grad_norm": 1.5651854276657104, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8854838609695435, + "num_tokens": 393330825.0, + "step": 10790 + }, + { + "epoch": 2.003899721448468, + "grad_norm": 1.6289985179901123, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8842921257019043, + "num_tokens": 393367636.0, + "step": 10791 + }, + { + "epoch": 2.0040854224698235, + "grad_norm": 1.5830684900283813, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8829440474510193, + "num_tokens": 393403470.0, + "step": 10792 + }, + { + "epoch": 2.004271123491179, + "grad_norm": 1.5876721143722534, + "learning_rate": 1e-06, + "loss": 0.27, + "mean_token_accuracy": 0.9015064239501953, + "num_tokens": 393434770.0, + "step": 10793 + }, + { + "epoch": 2.004456824512535, + "grad_norm": 1.5291942358016968, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8853076696395874, + "num_tokens": 393472720.0, + "step": 10794 + }, + { + "epoch": 2.00464252553389, + "grad_norm": 1.7106956243515015, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.878587007522583, + "num_tokens": 393503009.0, + "step": 10795 + }, + { + "epoch": 2.004828226555246, + "grad_norm": 1.7679579257965088, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.87058424949646, + "num_tokens": 393534947.0, + "step": 10796 + }, + { + "epoch": 2.0050139275766017, + "grad_norm": 1.8477829694747925, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8898283243179321, + "num_tokens": 393565899.0, + "step": 10797 + }, + { + "epoch": 2.0051996285979574, + "grad_norm": 1.6569932699203491, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8925189971923828, + "num_tokens": 393598078.0, + "step": 10798 + }, + { + "epoch": 2.0053853296193127, + "grad_norm": 1.3713797330856323, + "learning_rate": 1e-06, + "loss": 0.281, + "mean_token_accuracy": 0.8985251784324646, + "num_tokens": 393638675.0, + "step": 10799 + }, + { + "epoch": 2.0055710306406684, + "grad_norm": 1.7137891054153442, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8844775557518005, + "num_tokens": 393672726.0, + "step": 10800 + }, + { + "epoch": 2.005756731662024, + "grad_norm": 1.3984659910202026, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.890349268913269, + "num_tokens": 393715006.0, + "step": 10801 + }, + { + "epoch": 2.00594243268338, + "grad_norm": 1.4231586456298828, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8962324857711792, + "num_tokens": 393754859.0, + "step": 10802 + }, + { + "epoch": 2.006128133704735, + "grad_norm": 1.6059592962265015, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.891725480556488, + "num_tokens": 393790440.0, + "step": 10803 + }, + { + "epoch": 2.006313834726091, + "grad_norm": 1.5190585851669312, + "learning_rate": 1e-06, + "loss": 0.2435, + "mean_token_accuracy": 0.911967396736145, + "num_tokens": 393821445.0, + "step": 10804 + }, + { + "epoch": 2.0064995357474467, + "grad_norm": 1.578690528869629, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8879894018173218, + "num_tokens": 393858835.0, + "step": 10805 + }, + { + "epoch": 2.0066852367688024, + "grad_norm": 1.7337840795516968, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8876750469207764, + "num_tokens": 393889098.0, + "step": 10806 + }, + { + "epoch": 2.0068709377901577, + "grad_norm": 1.4256834983825684, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8876686692237854, + "num_tokens": 393933156.0, + "step": 10807 + }, + { + "epoch": 2.0070566388115134, + "grad_norm": 1.5183768272399902, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8834365010261536, + "num_tokens": 393972857.0, + "step": 10808 + }, + { + "epoch": 2.007242339832869, + "grad_norm": 1.377234697341919, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.895289421081543, + "num_tokens": 394015976.0, + "step": 10809 + }, + { + "epoch": 2.007428040854225, + "grad_norm": 1.59360933303833, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8864425420761108, + "num_tokens": 394052897.0, + "step": 10810 + }, + { + "epoch": 2.00761374187558, + "grad_norm": 1.8274072408676147, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8807292580604553, + "num_tokens": 394082348.0, + "step": 10811 + }, + { + "epoch": 2.007799442896936, + "grad_norm": 1.7108500003814697, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8860939741134644, + "num_tokens": 394117442.0, + "step": 10812 + }, + { + "epoch": 2.0079851439182916, + "grad_norm": 1.595569133758545, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8767328262329102, + "num_tokens": 394155155.0, + "step": 10813 + }, + { + "epoch": 2.0081708449396474, + "grad_norm": 1.588866114616394, + "learning_rate": 1e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.9001907110214233, + "num_tokens": 394188239.0, + "step": 10814 + }, + { + "epoch": 2.0083565459610027, + "grad_norm": 1.4423588514328003, + "learning_rate": 1e-06, + "loss": 0.2815, + "mean_token_accuracy": 0.8948469758033752, + "num_tokens": 394228734.0, + "step": 10815 + }, + { + "epoch": 2.0085422469823584, + "grad_norm": 1.5568351745605469, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8946887850761414, + "num_tokens": 394265282.0, + "step": 10816 + }, + { + "epoch": 2.008727948003714, + "grad_norm": 1.5382966995239258, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8788836002349854, + "num_tokens": 394305824.0, + "step": 10817 + }, + { + "epoch": 2.00891364902507, + "grad_norm": 1.6197636127471924, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8841714262962341, + "num_tokens": 394340382.0, + "step": 10818 + }, + { + "epoch": 2.009099350046425, + "grad_norm": 1.5386052131652832, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8880349397659302, + "num_tokens": 394379678.0, + "step": 10819 + }, + { + "epoch": 2.009285051067781, + "grad_norm": 1.5518321990966797, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8957653045654297, + "num_tokens": 394414111.0, + "step": 10820 + }, + { + "epoch": 2.0094707520891366, + "grad_norm": 1.6583982706069946, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8861435055732727, + "num_tokens": 394447345.0, + "step": 10821 + }, + { + "epoch": 2.009656453110492, + "grad_norm": 1.585891604423523, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8934463858604431, + "num_tokens": 394479615.0, + "step": 10822 + }, + { + "epoch": 2.0098421541318476, + "grad_norm": 1.5824190378189087, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8947926163673401, + "num_tokens": 394510695.0, + "step": 10823 + }, + { + "epoch": 2.0100278551532034, + "grad_norm": 1.8194187879562378, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8841350674629211, + "num_tokens": 394540161.0, + "step": 10824 + }, + { + "epoch": 2.010213556174559, + "grad_norm": 1.5202546119689941, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8875604867935181, + "num_tokens": 394577067.0, + "step": 10825 + }, + { + "epoch": 2.0103992571959144, + "grad_norm": 1.5639630556106567, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8823889493942261, + "num_tokens": 394616181.0, + "step": 10826 + }, + { + "epoch": 2.01058495821727, + "grad_norm": 1.4262923002243042, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8891613483428955, + "num_tokens": 394657959.0, + "step": 10827 + }, + { + "epoch": 2.010770659238626, + "grad_norm": 1.5906318426132202, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8816637992858887, + "num_tokens": 394695272.0, + "step": 10828 + }, + { + "epoch": 2.0109563602599816, + "grad_norm": 1.6478790044784546, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8741379976272583, + "num_tokens": 394731121.0, + "step": 10829 + }, + { + "epoch": 2.011142061281337, + "grad_norm": 1.4754552841186523, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.8932173252105713, + "num_tokens": 394768307.0, + "step": 10830 + }, + { + "epoch": 2.0113277623026926, + "grad_norm": 1.4073450565338135, + "learning_rate": 1e-06, + "loss": 0.2625, + "mean_token_accuracy": 0.9020735025405884, + "num_tokens": 394809423.0, + "step": 10831 + }, + { + "epoch": 2.0115134633240483, + "grad_norm": 1.5566877126693726, + "learning_rate": 1e-06, + "loss": 0.2606, + "mean_token_accuracy": 0.9073169827461243, + "num_tokens": 394843475.0, + "step": 10832 + }, + { + "epoch": 2.011699164345404, + "grad_norm": 1.5610164403915405, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.8906913995742798, + "num_tokens": 394880371.0, + "step": 10833 + }, + { + "epoch": 2.0118848653667594, + "grad_norm": 1.5791478157043457, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8901224136352539, + "num_tokens": 394916253.0, + "step": 10834 + }, + { + "epoch": 2.012070566388115, + "grad_norm": 1.5747064352035522, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8828030824661255, + "num_tokens": 394954757.0, + "step": 10835 + }, + { + "epoch": 2.012256267409471, + "grad_norm": 1.526902675628662, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8831615447998047, + "num_tokens": 394995902.0, + "step": 10836 + }, + { + "epoch": 2.0124419684308266, + "grad_norm": 1.569369912147522, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8801623582839966, + "num_tokens": 395033171.0, + "step": 10837 + }, + { + "epoch": 2.012627669452182, + "grad_norm": 1.521669626235962, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8792629837989807, + "num_tokens": 395070570.0, + "step": 10838 + }, + { + "epoch": 2.0128133704735376, + "grad_norm": 1.3980050086975098, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8916938304901123, + "num_tokens": 395113270.0, + "step": 10839 + }, + { + "epoch": 2.0129990714948933, + "grad_norm": 1.6365792751312256, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8856574296951294, + "num_tokens": 395147524.0, + "step": 10840 + }, + { + "epoch": 2.013184772516249, + "grad_norm": 1.5382139682769775, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8859660625457764, + "num_tokens": 395184013.0, + "step": 10841 + }, + { + "epoch": 2.0133704735376043, + "grad_norm": 1.5594159364700317, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8799642324447632, + "num_tokens": 395223772.0, + "step": 10842 + }, + { + "epoch": 2.01355617455896, + "grad_norm": 1.6811175346374512, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8816103935241699, + "num_tokens": 395258073.0, + "step": 10843 + }, + { + "epoch": 2.013741875580316, + "grad_norm": 1.6323981285095215, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8935336470603943, + "num_tokens": 395292661.0, + "step": 10844 + }, + { + "epoch": 2.013927576601671, + "grad_norm": 1.5860381126403809, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8731876015663147, + "num_tokens": 395331889.0, + "step": 10845 + }, + { + "epoch": 2.014113277623027, + "grad_norm": 1.7025375366210938, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.879307746887207, + "num_tokens": 395363570.0, + "step": 10846 + }, + { + "epoch": 2.0142989786443826, + "grad_norm": 1.455651879310608, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8970421552658081, + "num_tokens": 395403050.0, + "step": 10847 + }, + { + "epoch": 2.0144846796657383, + "grad_norm": 1.574014663696289, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8933953642845154, + "num_tokens": 395442714.0, + "step": 10848 + }, + { + "epoch": 2.0146703806870936, + "grad_norm": 1.5447412729263306, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.8992183208465576, + "num_tokens": 395481962.0, + "step": 10849 + }, + { + "epoch": 2.0148560817084493, + "grad_norm": 1.710282325744629, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8917868137359619, + "num_tokens": 395516312.0, + "step": 10850 + }, + { + "epoch": 2.015041782729805, + "grad_norm": 1.6154892444610596, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8887195587158203, + "num_tokens": 395556564.0, + "step": 10851 + }, + { + "epoch": 2.015227483751161, + "grad_norm": 1.5401771068572998, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8846776485443115, + "num_tokens": 395596567.0, + "step": 10852 + }, + { + "epoch": 2.015413184772516, + "grad_norm": 1.5420328378677368, + "learning_rate": 1e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.897861123085022, + "num_tokens": 395631795.0, + "step": 10853 + }, + { + "epoch": 2.015598885793872, + "grad_norm": 1.5640919208526611, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8926390409469604, + "num_tokens": 395666856.0, + "step": 10854 + }, + { + "epoch": 2.0157845868152275, + "grad_norm": 1.566929817199707, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8842999935150146, + "num_tokens": 395706286.0, + "step": 10855 + }, + { + "epoch": 2.0159702878365833, + "grad_norm": 1.4491263628005981, + "learning_rate": 1e-06, + "loss": 0.2749, + "mean_token_accuracy": 0.8977015018463135, + "num_tokens": 395746473.0, + "step": 10856 + }, + { + "epoch": 2.0161559888579386, + "grad_norm": 1.5647472143173218, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8932281732559204, + "num_tokens": 395782050.0, + "step": 10857 + }, + { + "epoch": 2.0163416898792943, + "grad_norm": 1.49311101436615, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8864275217056274, + "num_tokens": 395822955.0, + "step": 10858 + }, + { + "epoch": 2.01652739090065, + "grad_norm": 1.5869297981262207, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8927164673805237, + "num_tokens": 395857739.0, + "step": 10859 + }, + { + "epoch": 2.0167130919220058, + "grad_norm": 1.5433741807937622, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.878830075263977, + "num_tokens": 395900311.0, + "step": 10860 + }, + { + "epoch": 2.016898792943361, + "grad_norm": 1.6555612087249756, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.876690149307251, + "num_tokens": 395936855.0, + "step": 10861 + }, + { + "epoch": 2.017084493964717, + "grad_norm": 1.5273892879486084, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8683351278305054, + "num_tokens": 395977349.0, + "step": 10862 + }, + { + "epoch": 2.0172701949860725, + "grad_norm": 1.6719448566436768, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8862759470939636, + "num_tokens": 396012599.0, + "step": 10863 + }, + { + "epoch": 2.0174558960074283, + "grad_norm": 1.629438042640686, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8773364424705505, + "num_tokens": 396051857.0, + "step": 10864 + }, + { + "epoch": 2.0176415970287835, + "grad_norm": 1.521770715713501, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8862699270248413, + "num_tokens": 396088167.0, + "step": 10865 + }, + { + "epoch": 2.0178272980501393, + "grad_norm": 1.4543085098266602, + "learning_rate": 1e-06, + "loss": 0.2771, + "mean_token_accuracy": 0.9009692668914795, + "num_tokens": 396127798.0, + "step": 10866 + }, + { + "epoch": 2.018012999071495, + "grad_norm": 1.7031059265136719, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8744441270828247, + "num_tokens": 396160539.0, + "step": 10867 + }, + { + "epoch": 2.0181987000928503, + "grad_norm": 1.5246376991271973, + "learning_rate": 1e-06, + "loss": 0.282, + "mean_token_accuracy": 0.8955531120300293, + "num_tokens": 396195684.0, + "step": 10868 + }, + { + "epoch": 2.018384401114206, + "grad_norm": 1.824113368988037, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8776783347129822, + "num_tokens": 396229831.0, + "step": 10869 + }, + { + "epoch": 2.0185701021355618, + "grad_norm": 1.6091957092285156, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8845956325531006, + "num_tokens": 396266615.0, + "step": 10870 + }, + { + "epoch": 2.0187558031569175, + "grad_norm": 1.5487793684005737, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8837886452674866, + "num_tokens": 396304267.0, + "step": 10871 + }, + { + "epoch": 2.018941504178273, + "grad_norm": 1.4165698289871216, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.895980954170227, + "num_tokens": 396345512.0, + "step": 10872 + }, + { + "epoch": 2.0191272051996285, + "grad_norm": 1.4684503078460693, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.8956008553504944, + "num_tokens": 396385585.0, + "step": 10873 + }, + { + "epoch": 2.0193129062209842, + "grad_norm": 1.5204031467437744, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.888585090637207, + "num_tokens": 396424706.0, + "step": 10874 + }, + { + "epoch": 2.01949860724234, + "grad_norm": 1.524491786956787, + "learning_rate": 1e-06, + "loss": 0.2711, + "mean_token_accuracy": 0.9012816548347473, + "num_tokens": 396460962.0, + "step": 10875 + }, + { + "epoch": 2.0196843082636953, + "grad_norm": 1.5366156101226807, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8931710720062256, + "num_tokens": 396501419.0, + "step": 10876 + }, + { + "epoch": 2.019870009285051, + "grad_norm": 1.7090243101119995, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8798133134841919, + "num_tokens": 396534361.0, + "step": 10877 + }, + { + "epoch": 2.0200557103064067, + "grad_norm": 1.5492302179336548, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8823058605194092, + "num_tokens": 396576270.0, + "step": 10878 + }, + { + "epoch": 2.0202414113277625, + "grad_norm": 1.6544504165649414, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.887804388999939, + "num_tokens": 396612512.0, + "step": 10879 + }, + { + "epoch": 2.0204271123491178, + "grad_norm": 1.6636607646942139, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8820318579673767, + "num_tokens": 396646612.0, + "step": 10880 + }, + { + "epoch": 2.0206128133704735, + "grad_norm": 1.495139718055725, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.88907790184021, + "num_tokens": 396687052.0, + "step": 10881 + }, + { + "epoch": 2.0207985143918292, + "grad_norm": 1.522191047668457, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8846578001976013, + "num_tokens": 396727245.0, + "step": 10882 + }, + { + "epoch": 2.020984215413185, + "grad_norm": 1.5545878410339355, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8957535028457642, + "num_tokens": 396763876.0, + "step": 10883 + }, + { + "epoch": 2.0211699164345402, + "grad_norm": 1.590759515762329, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.885290801525116, + "num_tokens": 396799302.0, + "step": 10884 + }, + { + "epoch": 2.021355617455896, + "grad_norm": 1.559892177581787, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8889666199684143, + "num_tokens": 396836765.0, + "step": 10885 + }, + { + "epoch": 2.0215413184772517, + "grad_norm": 1.659803867340088, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8665446043014526, + "num_tokens": 396874918.0, + "step": 10886 + }, + { + "epoch": 2.0217270194986074, + "grad_norm": 1.4491257667541504, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8870283365249634, + "num_tokens": 396913712.0, + "step": 10887 + }, + { + "epoch": 2.0219127205199627, + "grad_norm": 1.6059609651565552, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8795702457427979, + "num_tokens": 396950280.0, + "step": 10888 + }, + { + "epoch": 2.0220984215413185, + "grad_norm": 1.4951950311660767, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8792505264282227, + "num_tokens": 396991000.0, + "step": 10889 + }, + { + "epoch": 2.022284122562674, + "grad_norm": 1.4757617712020874, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8959686160087585, + "num_tokens": 397029001.0, + "step": 10890 + }, + { + "epoch": 2.0224698235840295, + "grad_norm": 1.4520199298858643, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8867747783660889, + "num_tokens": 397069317.0, + "step": 10891 + }, + { + "epoch": 2.0226555246053852, + "grad_norm": 1.5405527353286743, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.89576256275177, + "num_tokens": 397105624.0, + "step": 10892 + }, + { + "epoch": 2.022841225626741, + "grad_norm": 1.6385904550552368, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.898338258266449, + "num_tokens": 397138285.0, + "step": 10893 + }, + { + "epoch": 2.0230269266480967, + "grad_norm": 1.7487390041351318, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8722093105316162, + "num_tokens": 397174873.0, + "step": 10894 + }, + { + "epoch": 2.023212627669452, + "grad_norm": 1.55515718460083, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.887677788734436, + "num_tokens": 397212555.0, + "step": 10895 + }, + { + "epoch": 2.0233983286908077, + "grad_norm": 1.7404731512069702, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.885797381401062, + "num_tokens": 397243629.0, + "step": 10896 + }, + { + "epoch": 2.0235840297121634, + "grad_norm": 1.6365305185317993, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8854089379310608, + "num_tokens": 397279634.0, + "step": 10897 + }, + { + "epoch": 2.023769730733519, + "grad_norm": 1.446765422821045, + "learning_rate": 1e-06, + "loss": 0.2539, + "mean_token_accuracy": 0.9071528315544128, + "num_tokens": 397317870.0, + "step": 10898 + }, + { + "epoch": 2.0239554317548745, + "grad_norm": 1.5666857957839966, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.8976411819458008, + "num_tokens": 397353286.0, + "step": 10899 + }, + { + "epoch": 2.02414113277623, + "grad_norm": 1.6513217687606812, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.887368381023407, + "num_tokens": 397389615.0, + "step": 10900 + }, + { + "epoch": 2.024326833797586, + "grad_norm": 1.5580055713653564, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8916476964950562, + "num_tokens": 397428656.0, + "step": 10901 + }, + { + "epoch": 2.0245125348189417, + "grad_norm": 1.6624815464019775, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8793098330497742, + "num_tokens": 397469307.0, + "step": 10902 + }, + { + "epoch": 2.024698235840297, + "grad_norm": 1.4935230016708374, + "learning_rate": 1e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.8985177874565125, + "num_tokens": 397505742.0, + "step": 10903 + }, + { + "epoch": 2.0248839368616527, + "grad_norm": 1.5146247148513794, + "learning_rate": 1e-06, + "loss": 0.2839, + "mean_token_accuracy": 0.8979276418685913, + "num_tokens": 397541470.0, + "step": 10904 + }, + { + "epoch": 2.0250696378830084, + "grad_norm": 1.4498754739761353, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8956241607666016, + "num_tokens": 397584950.0, + "step": 10905 + }, + { + "epoch": 2.025255338904364, + "grad_norm": 1.609225869178772, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.891433835029602, + "num_tokens": 397622537.0, + "step": 10906 + }, + { + "epoch": 2.0254410399257194, + "grad_norm": 1.8339402675628662, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.885276198387146, + "num_tokens": 397653428.0, + "step": 10907 + }, + { + "epoch": 2.025626740947075, + "grad_norm": 1.4516667127609253, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8944542407989502, + "num_tokens": 397692909.0, + "step": 10908 + }, + { + "epoch": 2.025812441968431, + "grad_norm": 1.6252110004425049, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8945355415344238, + "num_tokens": 397728516.0, + "step": 10909 + }, + { + "epoch": 2.0259981429897866, + "grad_norm": 1.5044394731521606, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.8966267108917236, + "num_tokens": 397762883.0, + "step": 10910 + }, + { + "epoch": 2.026183844011142, + "grad_norm": 1.5149835348129272, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8948476314544678, + "num_tokens": 397798696.0, + "step": 10911 + }, + { + "epoch": 2.0263695450324977, + "grad_norm": 1.785279631614685, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8723406791687012, + "num_tokens": 397831312.0, + "step": 10912 + }, + { + "epoch": 2.0265552460538534, + "grad_norm": 1.4653029441833496, + "learning_rate": 1e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.8968729376792908, + "num_tokens": 397873066.0, + "step": 10913 + }, + { + "epoch": 2.026740947075209, + "grad_norm": 1.6564382314682007, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8780951499938965, + "num_tokens": 397910650.0, + "step": 10914 + }, + { + "epoch": 2.0269266480965644, + "grad_norm": 1.5741500854492188, + "learning_rate": 1e-06, + "loss": 0.2883, + "mean_token_accuracy": 0.8946329355239868, + "num_tokens": 397944365.0, + "step": 10915 + }, + { + "epoch": 2.02711234911792, + "grad_norm": 1.4551900625228882, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8925095796585083, + "num_tokens": 397986444.0, + "step": 10916 + }, + { + "epoch": 2.027298050139276, + "grad_norm": 1.5705735683441162, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8832172155380249, + "num_tokens": 398025299.0, + "step": 10917 + }, + { + "epoch": 2.027483751160631, + "grad_norm": 1.5898470878601074, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8962267637252808, + "num_tokens": 398063209.0, + "step": 10918 + }, + { + "epoch": 2.027669452181987, + "grad_norm": 1.6639800071716309, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8853200674057007, + "num_tokens": 398101155.0, + "step": 10919 + }, + { + "epoch": 2.0278551532033426, + "grad_norm": 1.6911869049072266, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.88401859998703, + "num_tokens": 398137516.0, + "step": 10920 + }, + { + "epoch": 2.0280408542246984, + "grad_norm": 1.63893461227417, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8857262134552002, + "num_tokens": 398173021.0, + "step": 10921 + }, + { + "epoch": 2.0282265552460537, + "grad_norm": 1.4737855195999146, + "learning_rate": 1e-06, + "loss": 0.2805, + "mean_token_accuracy": 0.8985190391540527, + "num_tokens": 398211589.0, + "step": 10922 + }, + { + "epoch": 2.0284122562674094, + "grad_norm": 1.6898308992385864, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8877269625663757, + "num_tokens": 398244713.0, + "step": 10923 + }, + { + "epoch": 2.028597957288765, + "grad_norm": 1.401184320449829, + "learning_rate": 1e-06, + "loss": 0.2455, + "mean_token_accuracy": 0.9100183248519897, + "num_tokens": 398284283.0, + "step": 10924 + }, + { + "epoch": 2.028783658310121, + "grad_norm": 1.7103824615478516, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8703615069389343, + "num_tokens": 398324709.0, + "step": 10925 + }, + { + "epoch": 2.028969359331476, + "grad_norm": 1.4406148195266724, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.894325315952301, + "num_tokens": 398368373.0, + "step": 10926 + }, + { + "epoch": 2.029155060352832, + "grad_norm": 1.5198835134506226, + "learning_rate": 1e-06, + "loss": 0.2769, + "mean_token_accuracy": 0.8962540626525879, + "num_tokens": 398403168.0, + "step": 10927 + }, + { + "epoch": 2.0293407613741876, + "grad_norm": 1.5996750593185425, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8724607229232788, + "num_tokens": 398440346.0, + "step": 10928 + }, + { + "epoch": 2.0295264623955434, + "grad_norm": 1.7331933975219727, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8945451974868774, + "num_tokens": 398471263.0, + "step": 10929 + }, + { + "epoch": 2.0297121634168986, + "grad_norm": 1.5121039152145386, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8895792365074158, + "num_tokens": 398509697.0, + "step": 10930 + }, + { + "epoch": 2.0298978644382544, + "grad_norm": 1.5796362161636353, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8865958452224731, + "num_tokens": 398547729.0, + "step": 10931 + }, + { + "epoch": 2.03008356545961, + "grad_norm": 1.424622654914856, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8959695100784302, + "num_tokens": 398589363.0, + "step": 10932 + }, + { + "epoch": 2.030269266480966, + "grad_norm": 1.6434264183044434, + "learning_rate": 1e-06, + "loss": 0.2671, + "mean_token_accuracy": 0.9026713371276855, + "num_tokens": 398622217.0, + "step": 10933 + }, + { + "epoch": 2.030454967502321, + "grad_norm": 1.654247760772705, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8839008808135986, + "num_tokens": 398657220.0, + "step": 10934 + }, + { + "epoch": 2.030640668523677, + "grad_norm": 1.571239948272705, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8874051570892334, + "num_tokens": 398692321.0, + "step": 10935 + }, + { + "epoch": 2.0308263695450326, + "grad_norm": 1.6867889165878296, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.88132643699646, + "num_tokens": 398730031.0, + "step": 10936 + }, + { + "epoch": 2.0310120705663883, + "grad_norm": 1.529241681098938, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8859869241714478, + "num_tokens": 398770486.0, + "step": 10937 + }, + { + "epoch": 2.0311977715877436, + "grad_norm": 1.5621771812438965, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8809700012207031, + "num_tokens": 398811529.0, + "step": 10938 + }, + { + "epoch": 2.0313834726090993, + "grad_norm": 1.676683783531189, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8934949636459351, + "num_tokens": 398845545.0, + "step": 10939 + }, + { + "epoch": 2.031569173630455, + "grad_norm": 1.6174601316452026, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.882928192615509, + "num_tokens": 398883135.0, + "step": 10940 + }, + { + "epoch": 2.0317548746518104, + "grad_norm": 1.590002417564392, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8783602714538574, + "num_tokens": 398921400.0, + "step": 10941 + }, + { + "epoch": 2.031940575673166, + "grad_norm": 1.7455397844314575, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.888253927230835, + "num_tokens": 398954272.0, + "step": 10942 + }, + { + "epoch": 2.032126276694522, + "grad_norm": 1.5372599363327026, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8901735544204712, + "num_tokens": 398994608.0, + "step": 10943 + }, + { + "epoch": 2.0323119777158776, + "grad_norm": 1.6182626485824585, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8726538419723511, + "num_tokens": 399035186.0, + "step": 10944 + }, + { + "epoch": 2.032497678737233, + "grad_norm": 1.6622389554977417, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8706813454627991, + "num_tokens": 399073312.0, + "step": 10945 + }, + { + "epoch": 2.0326833797585886, + "grad_norm": 1.559767484664917, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8900465965270996, + "num_tokens": 399112597.0, + "step": 10946 + }, + { + "epoch": 2.0328690807799443, + "grad_norm": 1.5252317190170288, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8925313949584961, + "num_tokens": 399150156.0, + "step": 10947 + }, + { + "epoch": 2.0330547818013, + "grad_norm": 1.6951854228973389, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.887549877166748, + "num_tokens": 399185163.0, + "step": 10948 + }, + { + "epoch": 2.0332404828226553, + "grad_norm": 1.7362523078918457, + "learning_rate": 1e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.8957103490829468, + "num_tokens": 399221305.0, + "step": 10949 + }, + { + "epoch": 2.033426183844011, + "grad_norm": 1.7150747776031494, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8801335096359253, + "num_tokens": 399257922.0, + "step": 10950 + }, + { + "epoch": 2.033611884865367, + "grad_norm": 1.8422988653182983, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8729655742645264, + "num_tokens": 399287586.0, + "step": 10951 + }, + { + "epoch": 2.0337975858867225, + "grad_norm": 1.584349513053894, + "learning_rate": 1e-06, + "loss": 0.2518, + "mean_token_accuracy": 0.9074863791465759, + "num_tokens": 399321302.0, + "step": 10952 + }, + { + "epoch": 2.033983286908078, + "grad_norm": 1.8091245889663696, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8793909549713135, + "num_tokens": 399351566.0, + "step": 10953 + }, + { + "epoch": 2.0341689879294336, + "grad_norm": 1.7720173597335815, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8912228345870972, + "num_tokens": 399381680.0, + "step": 10954 + }, + { + "epoch": 2.0343546889507893, + "grad_norm": 1.5991315841674805, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8952990770339966, + "num_tokens": 399417307.0, + "step": 10955 + }, + { + "epoch": 2.034540389972145, + "grad_norm": 1.6210325956344604, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8853166699409485, + "num_tokens": 399454025.0, + "step": 10956 + }, + { + "epoch": 2.0347260909935003, + "grad_norm": 1.657232642173767, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8881412148475647, + "num_tokens": 399490740.0, + "step": 10957 + }, + { + "epoch": 2.034911792014856, + "grad_norm": 1.629462480545044, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.891577422618866, + "num_tokens": 399527770.0, + "step": 10958 + }, + { + "epoch": 2.035097493036212, + "grad_norm": 1.7247370481491089, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8953666687011719, + "num_tokens": 399558129.0, + "step": 10959 + }, + { + "epoch": 2.0352831940575675, + "grad_norm": 1.4738198518753052, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.887744665145874, + "num_tokens": 399600298.0, + "step": 10960 + }, + { + "epoch": 2.035468895078923, + "grad_norm": 1.628100037574768, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8853617906570435, + "num_tokens": 399636628.0, + "step": 10961 + }, + { + "epoch": 2.0356545961002785, + "grad_norm": 1.8640896081924438, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8921430110931396, + "num_tokens": 399667364.0, + "step": 10962 + }, + { + "epoch": 2.0358402971216343, + "grad_norm": 1.5711336135864258, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.888760507106781, + "num_tokens": 399707835.0, + "step": 10963 + }, + { + "epoch": 2.0360259981429896, + "grad_norm": 1.645833969116211, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8920798897743225, + "num_tokens": 399743225.0, + "step": 10964 + }, + { + "epoch": 2.0362116991643453, + "grad_norm": 1.6003870964050293, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8954317569732666, + "num_tokens": 399780799.0, + "step": 10965 + }, + { + "epoch": 2.036397400185701, + "grad_norm": 1.5038727521896362, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8925351500511169, + "num_tokens": 399819045.0, + "step": 10966 + }, + { + "epoch": 2.0365831012070568, + "grad_norm": 1.5475025177001953, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8921195268630981, + "num_tokens": 399858110.0, + "step": 10967 + }, + { + "epoch": 2.036768802228412, + "grad_norm": 1.6535875797271729, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8927195072174072, + "num_tokens": 399896947.0, + "step": 10968 + }, + { + "epoch": 2.036954503249768, + "grad_norm": 1.5924986600875854, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8904110789299011, + "num_tokens": 399932834.0, + "step": 10969 + }, + { + "epoch": 2.0371402042711235, + "grad_norm": 1.6712342500686646, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8872038125991821, + "num_tokens": 399967890.0, + "step": 10970 + }, + { + "epoch": 2.0373259052924793, + "grad_norm": 1.6112550497055054, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8917126059532166, + "num_tokens": 400002882.0, + "step": 10971 + }, + { + "epoch": 2.0375116063138345, + "grad_norm": 1.425799012184143, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8937174081802368, + "num_tokens": 400042004.0, + "step": 10972 + }, + { + "epoch": 2.0376973073351903, + "grad_norm": 1.649687647819519, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8864485025405884, + "num_tokens": 400079103.0, + "step": 10973 + }, + { + "epoch": 2.037883008356546, + "grad_norm": 1.5908191204071045, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8834273219108582, + "num_tokens": 400115705.0, + "step": 10974 + }, + { + "epoch": 2.0380687093779017, + "grad_norm": 1.6579681634902954, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8874198794364929, + "num_tokens": 400152173.0, + "step": 10975 + }, + { + "epoch": 2.038254410399257, + "grad_norm": 1.6136704683303833, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8917415142059326, + "num_tokens": 400187562.0, + "step": 10976 + }, + { + "epoch": 2.0384401114206128, + "grad_norm": 1.5761202573776245, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8862806558609009, + "num_tokens": 400227721.0, + "step": 10977 + }, + { + "epoch": 2.0386258124419685, + "grad_norm": 1.6871603727340698, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8898148536682129, + "num_tokens": 400264019.0, + "step": 10978 + }, + { + "epoch": 2.0388115134633242, + "grad_norm": 1.5070395469665527, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8878651261329651, + "num_tokens": 400303058.0, + "step": 10979 + }, + { + "epoch": 2.0389972144846795, + "grad_norm": 1.7879629135131836, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8818907141685486, + "num_tokens": 400335639.0, + "step": 10980 + }, + { + "epoch": 2.0391829155060353, + "grad_norm": 1.5891019105911255, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.8936710357666016, + "num_tokens": 400370369.0, + "step": 10981 + }, + { + "epoch": 2.039368616527391, + "grad_norm": 1.7197047472000122, + "learning_rate": 1e-06, + "loss": 0.2604, + "mean_token_accuracy": 0.9018423557281494, + "num_tokens": 400401837.0, + "step": 10982 + }, + { + "epoch": 2.0395543175487467, + "grad_norm": 1.5502345561981201, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8882367014884949, + "num_tokens": 400438994.0, + "step": 10983 + }, + { + "epoch": 2.039740018570102, + "grad_norm": 1.6241375207901, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8862642049789429, + "num_tokens": 400475737.0, + "step": 10984 + }, + { + "epoch": 2.0399257195914577, + "grad_norm": 1.6551166772842407, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8879189491271973, + "num_tokens": 400511967.0, + "step": 10985 + }, + { + "epoch": 2.0401114206128135, + "grad_norm": 1.6148900985717773, + "learning_rate": 1e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.8990840911865234, + "num_tokens": 400546495.0, + "step": 10986 + }, + { + "epoch": 2.040297121634169, + "grad_norm": 1.6080282926559448, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.876931369304657, + "num_tokens": 400584314.0, + "step": 10987 + }, + { + "epoch": 2.0404828226555245, + "grad_norm": 1.6867116689682007, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8868581056594849, + "num_tokens": 400618003.0, + "step": 10988 + }, + { + "epoch": 2.0406685236768802, + "grad_norm": 1.6174216270446777, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8806503415107727, + "num_tokens": 400656058.0, + "step": 10989 + }, + { + "epoch": 2.040854224698236, + "grad_norm": 1.6907137632369995, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8849350810050964, + "num_tokens": 400692486.0, + "step": 10990 + }, + { + "epoch": 2.0410399257195913, + "grad_norm": 1.5689512491226196, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8944456577301025, + "num_tokens": 400729317.0, + "step": 10991 + }, + { + "epoch": 2.041225626740947, + "grad_norm": 1.7156481742858887, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8682513236999512, + "num_tokens": 400764982.0, + "step": 10992 + }, + { + "epoch": 2.0414113277623027, + "grad_norm": 1.516626000404358, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8933950662612915, + "num_tokens": 400801251.0, + "step": 10993 + }, + { + "epoch": 2.0415970287836585, + "grad_norm": 1.5049102306365967, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.8958033323287964, + "num_tokens": 400839422.0, + "step": 10994 + }, + { + "epoch": 2.0417827298050137, + "grad_norm": 1.8365076780319214, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8879399299621582, + "num_tokens": 400869440.0, + "step": 10995 + }, + { + "epoch": 2.0419684308263695, + "grad_norm": 1.5820286273956299, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8731359243392944, + "num_tokens": 400908693.0, + "step": 10996 + }, + { + "epoch": 2.042154131847725, + "grad_norm": 1.4705653190612793, + "learning_rate": 1e-06, + "loss": 0.272, + "mean_token_accuracy": 0.9020279049873352, + "num_tokens": 400945735.0, + "step": 10997 + }, + { + "epoch": 2.042339832869081, + "grad_norm": 1.650443434715271, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8786280155181885, + "num_tokens": 400983293.0, + "step": 10998 + }, + { + "epoch": 2.0425255338904362, + "grad_norm": 1.5653102397918701, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8776335716247559, + "num_tokens": 401021097.0, + "step": 10999 + }, + { + "epoch": 2.042711234911792, + "grad_norm": 1.5991926193237305, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.8964586853981018, + "num_tokens": 401055067.0, + "step": 11000 + }, + { + "epoch": 2.0428969359331477, + "grad_norm": 1.5076007843017578, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8935904502868652, + "num_tokens": 401095773.0, + "step": 11001 + }, + { + "epoch": 2.0430826369545034, + "grad_norm": 1.498167872428894, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8924453854560852, + "num_tokens": 401135070.0, + "step": 11002 + }, + { + "epoch": 2.0432683379758587, + "grad_norm": 1.525833010673523, + "learning_rate": 1e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.893622875213623, + "num_tokens": 401170825.0, + "step": 11003 + }, + { + "epoch": 2.0434540389972145, + "grad_norm": 1.610005497932434, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8841822743415833, + "num_tokens": 401209614.0, + "step": 11004 + }, + { + "epoch": 2.04363974001857, + "grad_norm": 1.7729939222335815, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8843891620635986, + "num_tokens": 401242629.0, + "step": 11005 + }, + { + "epoch": 2.043825441039926, + "grad_norm": 1.533150315284729, + "learning_rate": 1e-06, + "loss": 0.2746, + "mean_token_accuracy": 0.9009150266647339, + "num_tokens": 401280541.0, + "step": 11006 + }, + { + "epoch": 2.044011142061281, + "grad_norm": 1.7409714460372925, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8937478065490723, + "num_tokens": 401311673.0, + "step": 11007 + }, + { + "epoch": 2.044196843082637, + "grad_norm": 1.599294900894165, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8892580270767212, + "num_tokens": 401348226.0, + "step": 11008 + }, + { + "epoch": 2.0443825441039927, + "grad_norm": 1.6783018112182617, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8811336755752563, + "num_tokens": 401382513.0, + "step": 11009 + }, + { + "epoch": 2.0445682451253484, + "grad_norm": 1.6530228853225708, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8828645944595337, + "num_tokens": 401419416.0, + "step": 11010 + }, + { + "epoch": 2.0447539461467037, + "grad_norm": 1.756669044494629, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8791195154190063, + "num_tokens": 401456507.0, + "step": 11011 + }, + { + "epoch": 2.0449396471680594, + "grad_norm": 1.6729637384414673, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8914788961410522, + "num_tokens": 401493171.0, + "step": 11012 + }, + { + "epoch": 2.045125348189415, + "grad_norm": 1.6191521883010864, + "learning_rate": 1e-06, + "loss": 0.2759, + "mean_token_accuracy": 0.9011662006378174, + "num_tokens": 401527890.0, + "step": 11013 + }, + { + "epoch": 2.0453110492107704, + "grad_norm": 1.737109899520874, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8798948526382446, + "num_tokens": 401563570.0, + "step": 11014 + }, + { + "epoch": 2.045496750232126, + "grad_norm": 1.5707870721817017, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8931407928466797, + "num_tokens": 401600297.0, + "step": 11015 + }, + { + "epoch": 2.045682451253482, + "grad_norm": 1.5819169282913208, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8737301826477051, + "num_tokens": 401640911.0, + "step": 11016 + }, + { + "epoch": 2.0458681522748376, + "grad_norm": 1.58056640625, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8796850442886353, + "num_tokens": 401681535.0, + "step": 11017 + }, + { + "epoch": 2.046053853296193, + "grad_norm": 1.8041008710861206, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8902584314346313, + "num_tokens": 401712320.0, + "step": 11018 + }, + { + "epoch": 2.0462395543175487, + "grad_norm": 1.676290512084961, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.883083164691925, + "num_tokens": 401747887.0, + "step": 11019 + }, + { + "epoch": 2.0464252553389044, + "grad_norm": 1.716623306274414, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8702894449234009, + "num_tokens": 401782181.0, + "step": 11020 + }, + { + "epoch": 2.04661095636026, + "grad_norm": 1.6614303588867188, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8929224014282227, + "num_tokens": 401816881.0, + "step": 11021 + }, + { + "epoch": 2.0467966573816154, + "grad_norm": 1.4786458015441895, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.891448974609375, + "num_tokens": 401858397.0, + "step": 11022 + }, + { + "epoch": 2.046982358402971, + "grad_norm": 1.4749091863632202, + "learning_rate": 1e-06, + "loss": 0.2651, + "mean_token_accuracy": 0.9026604890823364, + "num_tokens": 401897233.0, + "step": 11023 + }, + { + "epoch": 2.047168059424327, + "grad_norm": 1.7298568487167358, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8778814673423767, + "num_tokens": 401931686.0, + "step": 11024 + }, + { + "epoch": 2.0473537604456826, + "grad_norm": 1.731704592704773, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8808584213256836, + "num_tokens": 401966007.0, + "step": 11025 + }, + { + "epoch": 2.047539461467038, + "grad_norm": 1.6784297227859497, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8875522613525391, + "num_tokens": 401998777.0, + "step": 11026 + }, + { + "epoch": 2.0477251624883936, + "grad_norm": 1.6171411275863647, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8831042051315308, + "num_tokens": 402039091.0, + "step": 11027 + }, + { + "epoch": 2.0479108635097494, + "grad_norm": 1.7971537113189697, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8852789998054504, + "num_tokens": 402069368.0, + "step": 11028 + }, + { + "epoch": 2.048096564531105, + "grad_norm": 1.5561412572860718, + "learning_rate": 1e-06, + "loss": 0.2689, + "mean_token_accuracy": 0.9016631245613098, + "num_tokens": 402106418.0, + "step": 11029 + }, + { + "epoch": 2.0482822655524604, + "grad_norm": 1.69640052318573, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.896953821182251, + "num_tokens": 402141096.0, + "step": 11030 + }, + { + "epoch": 2.048467966573816, + "grad_norm": 1.6258797645568848, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8699894547462463, + "num_tokens": 402177026.0, + "step": 11031 + }, + { + "epoch": 2.048653667595172, + "grad_norm": 1.556321382522583, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8941174745559692, + "num_tokens": 402214785.0, + "step": 11032 + }, + { + "epoch": 2.0488393686165276, + "grad_norm": 1.5372304916381836, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8902230858802795, + "num_tokens": 402250319.0, + "step": 11033 + }, + { + "epoch": 2.049025069637883, + "grad_norm": 1.569303274154663, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8894104957580566, + "num_tokens": 402286605.0, + "step": 11034 + }, + { + "epoch": 2.0492107706592386, + "grad_norm": 1.6398438215255737, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8817088603973389, + "num_tokens": 402325660.0, + "step": 11035 + }, + { + "epoch": 2.0493964716805944, + "grad_norm": 1.584834098815918, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8718239068984985, + "num_tokens": 402366991.0, + "step": 11036 + }, + { + "epoch": 2.0495821727019496, + "grad_norm": 1.4542814493179321, + "learning_rate": 1e-06, + "loss": 0.2695, + "mean_token_accuracy": 0.9011942744255066, + "num_tokens": 402405767.0, + "step": 11037 + }, + { + "epoch": 2.0497678737233054, + "grad_norm": 1.6358247995376587, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8891026377677917, + "num_tokens": 402443182.0, + "step": 11038 + }, + { + "epoch": 2.049953574744661, + "grad_norm": 1.5675731897354126, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8809807300567627, + "num_tokens": 402482182.0, + "step": 11039 + }, + { + "epoch": 2.050139275766017, + "grad_norm": 1.7931365966796875, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8755983710289001, + "num_tokens": 402515247.0, + "step": 11040 + }, + { + "epoch": 2.050324976787372, + "grad_norm": 1.5401537418365479, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.8970879912376404, + "num_tokens": 402552712.0, + "step": 11041 + }, + { + "epoch": 2.050510677808728, + "grad_norm": 1.6531907320022583, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8841887712478638, + "num_tokens": 402588179.0, + "step": 11042 + }, + { + "epoch": 2.0506963788300836, + "grad_norm": 1.5990866422653198, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8944785594940186, + "num_tokens": 402622648.0, + "step": 11043 + }, + { + "epoch": 2.0508820798514393, + "grad_norm": 1.6184285879135132, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8866530656814575, + "num_tokens": 402655421.0, + "step": 11044 + }, + { + "epoch": 2.0510677808727946, + "grad_norm": 1.4185038805007935, + "learning_rate": 1e-06, + "loss": 0.2669, + "mean_token_accuracy": 0.9040311574935913, + "num_tokens": 402693788.0, + "step": 11045 + }, + { + "epoch": 2.0512534818941504, + "grad_norm": 1.555708885192871, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8859129548072815, + "num_tokens": 402730670.0, + "step": 11046 + }, + { + "epoch": 2.051439182915506, + "grad_norm": 1.7334368228912354, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8870788812637329, + "num_tokens": 402764836.0, + "step": 11047 + }, + { + "epoch": 2.051624883936862, + "grad_norm": 1.5647087097167969, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8877013325691223, + "num_tokens": 402804757.0, + "step": 11048 + }, + { + "epoch": 2.051810584958217, + "grad_norm": 1.605433702468872, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.8991915583610535, + "num_tokens": 402836995.0, + "step": 11049 + }, + { + "epoch": 2.051996285979573, + "grad_norm": 1.5476611852645874, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8938502073287964, + "num_tokens": 402873789.0, + "step": 11050 + }, + { + "epoch": 2.0521819870009286, + "grad_norm": 1.528005838394165, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.880452036857605, + "num_tokens": 402913310.0, + "step": 11051 + }, + { + "epoch": 2.0523676880222843, + "grad_norm": 1.5837388038635254, + "learning_rate": 1e-06, + "loss": 0.2758, + "mean_token_accuracy": 0.9021995067596436, + "num_tokens": 402949025.0, + "step": 11052 + }, + { + "epoch": 2.0525533890436396, + "grad_norm": 1.750038981437683, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.884841799736023, + "num_tokens": 402980859.0, + "step": 11053 + }, + { + "epoch": 2.0527390900649953, + "grad_norm": 1.4757740497589111, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8852607607841492, + "num_tokens": 403022232.0, + "step": 11054 + }, + { + "epoch": 2.052924791086351, + "grad_norm": 1.7833374738693237, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8849369883537292, + "num_tokens": 403051935.0, + "step": 11055 + }, + { + "epoch": 2.053110492107707, + "grad_norm": 1.6750446557998657, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8853458166122437, + "num_tokens": 403086998.0, + "step": 11056 + }, + { + "epoch": 2.053296193129062, + "grad_norm": 1.5914312601089478, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8837096691131592, + "num_tokens": 403122259.0, + "step": 11057 + }, + { + "epoch": 2.053481894150418, + "grad_norm": 1.494443655014038, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8904949426651001, + "num_tokens": 403163956.0, + "step": 11058 + }, + { + "epoch": 2.0536675951717736, + "grad_norm": 1.6349462270736694, + "learning_rate": 1e-06, + "loss": 0.2542, + "mean_token_accuracy": 0.9043453335762024, + "num_tokens": 403194136.0, + "step": 11059 + }, + { + "epoch": 2.053853296193129, + "grad_norm": 1.623597264289856, + "learning_rate": 1e-06, + "loss": 0.2849, + "mean_token_accuracy": 0.8983523845672607, + "num_tokens": 403231216.0, + "step": 11060 + }, + { + "epoch": 2.0540389972144846, + "grad_norm": 1.797034502029419, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8923027515411377, + "num_tokens": 403262171.0, + "step": 11061 + }, + { + "epoch": 2.0542246982358403, + "grad_norm": 1.6585477590560913, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8901085257530212, + "num_tokens": 403296234.0, + "step": 11062 + }, + { + "epoch": 2.054410399257196, + "grad_norm": 1.5372560024261475, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8854848742485046, + "num_tokens": 403337203.0, + "step": 11063 + }, + { + "epoch": 2.0545961002785513, + "grad_norm": 1.4162421226501465, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8944202661514282, + "num_tokens": 403379045.0, + "step": 11064 + }, + { + "epoch": 2.054781801299907, + "grad_norm": 1.7757760286331177, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8797141909599304, + "num_tokens": 403409340.0, + "step": 11065 + }, + { + "epoch": 2.054967502321263, + "grad_norm": 1.5920361280441284, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.877842128276825, + "num_tokens": 403451083.0, + "step": 11066 + }, + { + "epoch": 2.0551532033426185, + "grad_norm": 1.5754430294036865, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8911871910095215, + "num_tokens": 403485744.0, + "step": 11067 + }, + { + "epoch": 2.055338904363974, + "grad_norm": 1.427693247795105, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.8990851640701294, + "num_tokens": 403523395.0, + "step": 11068 + }, + { + "epoch": 2.0555246053853296, + "grad_norm": 1.7560770511627197, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8637388348579407, + "num_tokens": 403556646.0, + "step": 11069 + }, + { + "epoch": 2.0557103064066853, + "grad_norm": 1.5168356895446777, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8963544964790344, + "num_tokens": 403593782.0, + "step": 11070 + }, + { + "epoch": 2.055896007428041, + "grad_norm": 1.5663119554519653, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8834123611450195, + "num_tokens": 403633286.0, + "step": 11071 + }, + { + "epoch": 2.0560817084493963, + "grad_norm": 1.708177924156189, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.884103000164032, + "num_tokens": 403669935.0, + "step": 11072 + }, + { + "epoch": 2.056267409470752, + "grad_norm": 1.5132670402526855, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8825082778930664, + "num_tokens": 403711545.0, + "step": 11073 + }, + { + "epoch": 2.0564531104921078, + "grad_norm": 1.7347458600997925, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8781464695930481, + "num_tokens": 403745714.0, + "step": 11074 + }, + { + "epoch": 2.0566388115134635, + "grad_norm": 1.6463209390640259, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8848309516906738, + "num_tokens": 403778449.0, + "step": 11075 + }, + { + "epoch": 2.056824512534819, + "grad_norm": 1.817678689956665, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8944814205169678, + "num_tokens": 403809769.0, + "step": 11076 + }, + { + "epoch": 2.0570102135561745, + "grad_norm": 1.5080097913742065, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8729389309883118, + "num_tokens": 403855650.0, + "step": 11077 + }, + { + "epoch": 2.0571959145775303, + "grad_norm": 1.6247806549072266, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.8958560228347778, + "num_tokens": 403890010.0, + "step": 11078 + }, + { + "epoch": 2.057381615598886, + "grad_norm": 1.453977346420288, + "learning_rate": 1e-06, + "loss": 0.2743, + "mean_token_accuracy": 0.8983099460601807, + "num_tokens": 403929908.0, + "step": 11079 + }, + { + "epoch": 2.0575673166202413, + "grad_norm": 1.6027960777282715, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8835445046424866, + "num_tokens": 403966298.0, + "step": 11080 + }, + { + "epoch": 2.057753017641597, + "grad_norm": 1.6762182712554932, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8751188516616821, + "num_tokens": 404003123.0, + "step": 11081 + }, + { + "epoch": 2.0579387186629527, + "grad_norm": 1.6652488708496094, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8837353587150574, + "num_tokens": 404036166.0, + "step": 11082 + }, + { + "epoch": 2.0581244196843085, + "grad_norm": 1.5315024852752686, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.887365460395813, + "num_tokens": 404073675.0, + "step": 11083 + }, + { + "epoch": 2.0583101207056638, + "grad_norm": 1.782092571258545, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8770794868469238, + "num_tokens": 404110144.0, + "step": 11084 + }, + { + "epoch": 2.0584958217270195, + "grad_norm": 1.6419568061828613, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8741748332977295, + "num_tokens": 404147136.0, + "step": 11085 + }, + { + "epoch": 2.0586815227483752, + "grad_norm": 1.4222664833068848, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8842144012451172, + "num_tokens": 404189976.0, + "step": 11086 + }, + { + "epoch": 2.0588672237697305, + "grad_norm": 1.6045706272125244, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8890166282653809, + "num_tokens": 404228253.0, + "step": 11087 + }, + { + "epoch": 2.0590529247910863, + "grad_norm": 1.5408309698104858, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8925545811653137, + "num_tokens": 404265682.0, + "step": 11088 + }, + { + "epoch": 2.059238625812442, + "grad_norm": 1.518203854560852, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8961747884750366, + "num_tokens": 404301175.0, + "step": 11089 + }, + { + "epoch": 2.0594243268337977, + "grad_norm": 1.5001037120819092, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8799911737442017, + "num_tokens": 404339333.0, + "step": 11090 + }, + { + "epoch": 2.059610027855153, + "grad_norm": 1.5581722259521484, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8810278177261353, + "num_tokens": 404378180.0, + "step": 11091 + }, + { + "epoch": 2.0597957288765087, + "grad_norm": 1.6097415685653687, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8821415901184082, + "num_tokens": 404416123.0, + "step": 11092 + }, + { + "epoch": 2.0599814298978645, + "grad_norm": 1.6522208452224731, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8881814479827881, + "num_tokens": 404449272.0, + "step": 11093 + }, + { + "epoch": 2.06016713091922, + "grad_norm": 1.5568894147872925, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8865184187889099, + "num_tokens": 404488421.0, + "step": 11094 + }, + { + "epoch": 2.0603528319405755, + "grad_norm": 1.6338526010513306, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8846738338470459, + "num_tokens": 404523428.0, + "step": 11095 + }, + { + "epoch": 2.0605385329619312, + "grad_norm": 1.5900217294692993, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.8959879875183105, + "num_tokens": 404557955.0, + "step": 11096 + }, + { + "epoch": 2.060724233983287, + "grad_norm": 1.583734393119812, + "learning_rate": 1e-06, + "loss": 0.2767, + "mean_token_accuracy": 0.899117112159729, + "num_tokens": 404594264.0, + "step": 11097 + }, + { + "epoch": 2.0609099350046427, + "grad_norm": 1.6154383420944214, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8823574781417847, + "num_tokens": 404631927.0, + "step": 11098 + }, + { + "epoch": 2.061095636025998, + "grad_norm": 1.5767967700958252, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8924384713172913, + "num_tokens": 404672042.0, + "step": 11099 + }, + { + "epoch": 2.0612813370473537, + "grad_norm": 1.6601594686508179, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8815439939498901, + "num_tokens": 404711782.0, + "step": 11100 + }, + { + "epoch": 2.0614670380687095, + "grad_norm": 1.8085702657699585, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8671731948852539, + "num_tokens": 404745788.0, + "step": 11101 + }, + { + "epoch": 2.061652739090065, + "grad_norm": 1.7252905368804932, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8852753043174744, + "num_tokens": 404779101.0, + "step": 11102 + }, + { + "epoch": 2.0618384401114205, + "grad_norm": 1.6995065212249756, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8847663402557373, + "num_tokens": 404815521.0, + "step": 11103 + }, + { + "epoch": 2.062024141132776, + "grad_norm": 1.7089004516601562, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8826100826263428, + "num_tokens": 404851020.0, + "step": 11104 + }, + { + "epoch": 2.062209842154132, + "grad_norm": 1.585217833518982, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8930886387825012, + "num_tokens": 404888934.0, + "step": 11105 + }, + { + "epoch": 2.0623955431754877, + "grad_norm": 1.828010082244873, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8745301961898804, + "num_tokens": 404921825.0, + "step": 11106 + }, + { + "epoch": 2.062581244196843, + "grad_norm": 1.4071320295333862, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8891721963882446, + "num_tokens": 404965191.0, + "step": 11107 + }, + { + "epoch": 2.0627669452181987, + "grad_norm": 1.4984296560287476, + "learning_rate": 1e-06, + "loss": 0.2748, + "mean_token_accuracy": 0.8977474570274353, + "num_tokens": 405004720.0, + "step": 11108 + }, + { + "epoch": 2.0629526462395544, + "grad_norm": 1.5420985221862793, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8864251971244812, + "num_tokens": 405050003.0, + "step": 11109 + }, + { + "epoch": 2.0631383472609097, + "grad_norm": 1.4855352640151978, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8909172415733337, + "num_tokens": 405090078.0, + "step": 11110 + }, + { + "epoch": 2.0633240482822655, + "grad_norm": 1.640874981880188, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8714503645896912, + "num_tokens": 405129488.0, + "step": 11111 + }, + { + "epoch": 2.063509749303621, + "grad_norm": 1.5655418634414673, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8929200172424316, + "num_tokens": 405165935.0, + "step": 11112 + }, + { + "epoch": 2.063695450324977, + "grad_norm": 1.6287552118301392, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8795475959777832, + "num_tokens": 405206689.0, + "step": 11113 + }, + { + "epoch": 2.063881151346332, + "grad_norm": 1.608350157737732, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8847581148147583, + "num_tokens": 405243741.0, + "step": 11114 + }, + { + "epoch": 2.064066852367688, + "grad_norm": 1.7788012027740479, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8637374639511108, + "num_tokens": 405278360.0, + "step": 11115 + }, + { + "epoch": 2.0642525533890437, + "grad_norm": 1.569543719291687, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8809628486633301, + "num_tokens": 405319712.0, + "step": 11116 + }, + { + "epoch": 2.0644382544103994, + "grad_norm": 1.5461453199386597, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.9004616737365723, + "num_tokens": 405356966.0, + "step": 11117 + }, + { + "epoch": 2.0646239554317547, + "grad_norm": 1.8135275840759277, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8690273761749268, + "num_tokens": 405389789.0, + "step": 11118 + }, + { + "epoch": 2.0648096564531104, + "grad_norm": 1.82096529006958, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8814585208892822, + "num_tokens": 405421871.0, + "step": 11119 + }, + { + "epoch": 2.064995357474466, + "grad_norm": 1.5183545351028442, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8921419382095337, + "num_tokens": 405461549.0, + "step": 11120 + }, + { + "epoch": 2.065181058495822, + "grad_norm": 1.5091454982757568, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8898409605026245, + "num_tokens": 405499202.0, + "step": 11121 + }, + { + "epoch": 2.065366759517177, + "grad_norm": 1.5700538158416748, + "learning_rate": 1e-06, + "loss": 0.2662, + "mean_token_accuracy": 0.9014238715171814, + "num_tokens": 405533265.0, + "step": 11122 + }, + { + "epoch": 2.065552460538533, + "grad_norm": 1.6336458921432495, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8862869143486023, + "num_tokens": 405567659.0, + "step": 11123 + }, + { + "epoch": 2.0657381615598887, + "grad_norm": 1.5395172834396362, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8876215219497681, + "num_tokens": 405605408.0, + "step": 11124 + }, + { + "epoch": 2.0659238625812444, + "grad_norm": 1.5789176225662231, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.876251220703125, + "num_tokens": 405643246.0, + "step": 11125 + }, + { + "epoch": 2.0661095636025997, + "grad_norm": 1.5945466756820679, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8987166881561279, + "num_tokens": 405677505.0, + "step": 11126 + }, + { + "epoch": 2.0662952646239554, + "grad_norm": 1.7692240476608276, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8917528390884399, + "num_tokens": 405710663.0, + "step": 11127 + }, + { + "epoch": 2.066480965645311, + "grad_norm": 1.6961339712142944, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8804413676261902, + "num_tokens": 405745575.0, + "step": 11128 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 1.5141853094100952, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8806847333908081, + "num_tokens": 405785641.0, + "step": 11129 + }, + { + "epoch": 2.066852367688022, + "grad_norm": 1.643648386001587, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8898240923881531, + "num_tokens": 405817392.0, + "step": 11130 + }, + { + "epoch": 2.067038068709378, + "grad_norm": 1.5020601749420166, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8881747722625732, + "num_tokens": 405856609.0, + "step": 11131 + }, + { + "epoch": 2.0672237697307336, + "grad_norm": 1.615955114364624, + "learning_rate": 1e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.897404670715332, + "num_tokens": 405889404.0, + "step": 11132 + }, + { + "epoch": 2.0674094707520894, + "grad_norm": 1.7906608581542969, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8801560401916504, + "num_tokens": 405920756.0, + "step": 11133 + }, + { + "epoch": 2.0675951717734447, + "grad_norm": 1.7412785291671753, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8868296146392822, + "num_tokens": 405954257.0, + "step": 11134 + }, + { + "epoch": 2.0677808727948004, + "grad_norm": 1.6501808166503906, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8963536024093628, + "num_tokens": 405989676.0, + "step": 11135 + }, + { + "epoch": 2.067966573816156, + "grad_norm": 1.448046326637268, + "learning_rate": 1e-06, + "loss": 0.2757, + "mean_token_accuracy": 0.899591326713562, + "num_tokens": 406029799.0, + "step": 11136 + }, + { + "epoch": 2.0681522748375114, + "grad_norm": 1.6019173860549927, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8888182640075684, + "num_tokens": 406067580.0, + "step": 11137 + }, + { + "epoch": 2.068337975858867, + "grad_norm": 1.7403124570846558, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8784725069999695, + "num_tokens": 406101035.0, + "step": 11138 + }, + { + "epoch": 2.068523676880223, + "grad_norm": 1.7170499563217163, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8915771245956421, + "num_tokens": 406133560.0, + "step": 11139 + }, + { + "epoch": 2.0687093779015786, + "grad_norm": 1.5844601392745972, + "learning_rate": 1e-06, + "loss": 0.281, + "mean_token_accuracy": 0.8941083550453186, + "num_tokens": 406169300.0, + "step": 11140 + }, + { + "epoch": 2.068895078922934, + "grad_norm": 1.6861544847488403, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8861684799194336, + "num_tokens": 406202295.0, + "step": 11141 + }, + { + "epoch": 2.0690807799442896, + "grad_norm": 1.5582504272460938, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8851675987243652, + "num_tokens": 406242953.0, + "step": 11142 + }, + { + "epoch": 2.0692664809656454, + "grad_norm": 1.5797029733657837, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8715577125549316, + "num_tokens": 406281706.0, + "step": 11143 + }, + { + "epoch": 2.069452181987001, + "grad_norm": 1.7184725999832153, + "learning_rate": 1e-06, + "loss": 0.2781, + "mean_token_accuracy": 0.8988103866577148, + "num_tokens": 406312040.0, + "step": 11144 + }, + { + "epoch": 2.0696378830083564, + "grad_norm": 1.6630663871765137, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8806053400039673, + "num_tokens": 406347726.0, + "step": 11145 + }, + { + "epoch": 2.069823584029712, + "grad_norm": 1.8031562566757202, + "learning_rate": 1e-06, + "loss": 0.2759, + "mean_token_accuracy": 0.9001575112342834, + "num_tokens": 406378700.0, + "step": 11146 + }, + { + "epoch": 2.070009285051068, + "grad_norm": 1.577062964439392, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8923755884170532, + "num_tokens": 406417438.0, + "step": 11147 + }, + { + "epoch": 2.0701949860724236, + "grad_norm": 1.6280348300933838, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8926030397415161, + "num_tokens": 406451434.0, + "step": 11148 + }, + { + "epoch": 2.070380687093779, + "grad_norm": 1.3604744672775269, + "learning_rate": 1e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.9009035229682922, + "num_tokens": 406496818.0, + "step": 11149 + }, + { + "epoch": 2.0705663881151346, + "grad_norm": 1.5962128639221191, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8926798105239868, + "num_tokens": 406533477.0, + "step": 11150 + }, + { + "epoch": 2.0707520891364903, + "grad_norm": 1.8233062028884888, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8806719183921814, + "num_tokens": 406565719.0, + "step": 11151 + }, + { + "epoch": 2.070937790157846, + "grad_norm": 1.6682413816452026, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8922688364982605, + "num_tokens": 406599793.0, + "step": 11152 + }, + { + "epoch": 2.0711234911792014, + "grad_norm": 1.5760116577148438, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8835379481315613, + "num_tokens": 406634981.0, + "step": 11153 + }, + { + "epoch": 2.071309192200557, + "grad_norm": 1.609602451324463, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8940050601959229, + "num_tokens": 406670960.0, + "step": 11154 + }, + { + "epoch": 2.071494893221913, + "grad_norm": 1.698731541633606, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8760464191436768, + "num_tokens": 406706798.0, + "step": 11155 + }, + { + "epoch": 2.0716805942432686, + "grad_norm": 1.6135269403457642, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8777408599853516, + "num_tokens": 406747045.0, + "step": 11156 + }, + { + "epoch": 2.071866295264624, + "grad_norm": 1.6465067863464355, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8814512491226196, + "num_tokens": 406784983.0, + "step": 11157 + }, + { + "epoch": 2.0720519962859796, + "grad_norm": 1.6052607297897339, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8886991143226624, + "num_tokens": 406820074.0, + "step": 11158 + }, + { + "epoch": 2.0722376973073353, + "grad_norm": 1.7674115896224976, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8842552900314331, + "num_tokens": 406852574.0, + "step": 11159 + }, + { + "epoch": 2.0724233983286906, + "grad_norm": 1.601393222808838, + "learning_rate": 1e-06, + "loss": 0.2832, + "mean_token_accuracy": 0.9014548063278198, + "num_tokens": 406887803.0, + "step": 11160 + }, + { + "epoch": 2.0726090993500463, + "grad_norm": 1.5239689350128174, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8828568458557129, + "num_tokens": 406928388.0, + "step": 11161 + }, + { + "epoch": 2.072794800371402, + "grad_norm": 1.6427032947540283, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.885308027267456, + "num_tokens": 406964500.0, + "step": 11162 + }, + { + "epoch": 2.072980501392758, + "grad_norm": 1.577830195426941, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.880464494228363, + "num_tokens": 407007051.0, + "step": 11163 + }, + { + "epoch": 2.073166202414113, + "grad_norm": 1.5921427011489868, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8984404802322388, + "num_tokens": 407043762.0, + "step": 11164 + }, + { + "epoch": 2.073351903435469, + "grad_norm": 1.6337491273880005, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8752941489219666, + "num_tokens": 407081417.0, + "step": 11165 + }, + { + "epoch": 2.0735376044568246, + "grad_norm": 1.7890524864196777, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8747794032096863, + "num_tokens": 407117711.0, + "step": 11166 + }, + { + "epoch": 2.0737233054781803, + "grad_norm": 1.6654911041259766, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8840241432189941, + "num_tokens": 407156827.0, + "step": 11167 + }, + { + "epoch": 2.0739090064995356, + "grad_norm": 1.5359269380569458, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8898470997810364, + "num_tokens": 407199506.0, + "step": 11168 + }, + { + "epoch": 2.0740947075208913, + "grad_norm": 1.7185407876968384, + "learning_rate": 1e-06, + "loss": 0.2559, + "mean_token_accuracy": 0.9033512473106384, + "num_tokens": 407225885.0, + "step": 11169 + }, + { + "epoch": 2.074280408542247, + "grad_norm": 1.673176646232605, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8755543828010559, + "num_tokens": 407261492.0, + "step": 11170 + }, + { + "epoch": 2.074466109563603, + "grad_norm": 1.740936517715454, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8743858337402344, + "num_tokens": 407299618.0, + "step": 11171 + }, + { + "epoch": 2.074651810584958, + "grad_norm": 1.522154688835144, + "learning_rate": 1e-06, + "loss": 0.274, + "mean_token_accuracy": 0.9004733562469482, + "num_tokens": 407340192.0, + "step": 11172 + }, + { + "epoch": 2.074837511606314, + "grad_norm": 1.5997464656829834, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8961248993873596, + "num_tokens": 407374128.0, + "step": 11173 + }, + { + "epoch": 2.0750232126276695, + "grad_norm": 1.620800495147705, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8736181855201721, + "num_tokens": 407412590.0, + "step": 11174 + }, + { + "epoch": 2.0752089136490253, + "grad_norm": 1.4887917041778564, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8876184225082397, + "num_tokens": 407455526.0, + "step": 11175 + }, + { + "epoch": 2.0753946146703806, + "grad_norm": 1.676370620727539, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8717842698097229, + "num_tokens": 407492014.0, + "step": 11176 + }, + { + "epoch": 2.0755803156917363, + "grad_norm": 1.6545237302780151, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.8913732171058655, + "num_tokens": 407525472.0, + "step": 11177 + }, + { + "epoch": 2.075766016713092, + "grad_norm": 1.5141422748565674, + "learning_rate": 1e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.8951917886734009, + "num_tokens": 407563926.0, + "step": 11178 + }, + { + "epoch": 2.0759517177344478, + "grad_norm": 1.6996992826461792, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8909692764282227, + "num_tokens": 407600366.0, + "step": 11179 + }, + { + "epoch": 2.076137418755803, + "grad_norm": 1.719815731048584, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8786529898643494, + "num_tokens": 407631452.0, + "step": 11180 + }, + { + "epoch": 2.0763231197771588, + "grad_norm": 1.6382559537887573, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8866209387779236, + "num_tokens": 407666202.0, + "step": 11181 + }, + { + "epoch": 2.0765088207985145, + "grad_norm": 1.6482666730880737, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8855483531951904, + "num_tokens": 407702494.0, + "step": 11182 + }, + { + "epoch": 2.07669452181987, + "grad_norm": 1.5637246370315552, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8889193534851074, + "num_tokens": 407743039.0, + "step": 11183 + }, + { + "epoch": 2.0768802228412255, + "grad_norm": 1.725890874862671, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8886715173721313, + "num_tokens": 407776783.0, + "step": 11184 + }, + { + "epoch": 2.0770659238625813, + "grad_norm": 1.7266426086425781, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8865033388137817, + "num_tokens": 407813729.0, + "step": 11185 + }, + { + "epoch": 2.077251624883937, + "grad_norm": 1.5922540426254272, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8951952457427979, + "num_tokens": 407849174.0, + "step": 11186 + }, + { + "epoch": 2.0774373259052923, + "grad_norm": 1.621208667755127, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8876562714576721, + "num_tokens": 407883892.0, + "step": 11187 + }, + { + "epoch": 2.077623026926648, + "grad_norm": 1.5749324560165405, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8908337354660034, + "num_tokens": 407920108.0, + "step": 11188 + }, + { + "epoch": 2.0778087279480038, + "grad_norm": 1.4873337745666504, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8894833326339722, + "num_tokens": 407959755.0, + "step": 11189 + }, + { + "epoch": 2.0779944289693595, + "grad_norm": 1.5098267793655396, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8900863528251648, + "num_tokens": 407999513.0, + "step": 11190 + }, + { + "epoch": 2.0781801299907148, + "grad_norm": 1.486657738685608, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8917690515518188, + "num_tokens": 408039730.0, + "step": 11191 + }, + { + "epoch": 2.0783658310120705, + "grad_norm": 1.6355727910995483, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8790795803070068, + "num_tokens": 408077582.0, + "step": 11192 + }, + { + "epoch": 2.0785515320334262, + "grad_norm": 1.5008935928344727, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.890654444694519, + "num_tokens": 408118423.0, + "step": 11193 + }, + { + "epoch": 2.078737233054782, + "grad_norm": 1.6710319519042969, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8808751702308655, + "num_tokens": 408157348.0, + "step": 11194 + }, + { + "epoch": 2.0789229340761373, + "grad_norm": 1.6577553749084473, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.891768753528595, + "num_tokens": 408192186.0, + "step": 11195 + }, + { + "epoch": 2.079108635097493, + "grad_norm": 1.591936469078064, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8884269595146179, + "num_tokens": 408226004.0, + "step": 11196 + }, + { + "epoch": 2.0792943361188487, + "grad_norm": 1.6412971019744873, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8840333819389343, + "num_tokens": 408261150.0, + "step": 11197 + }, + { + "epoch": 2.0794800371402045, + "grad_norm": 1.5386590957641602, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8872296810150146, + "num_tokens": 408300698.0, + "step": 11198 + }, + { + "epoch": 2.0796657381615598, + "grad_norm": 1.7158373594284058, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8860633969306946, + "num_tokens": 408336701.0, + "step": 11199 + }, + { + "epoch": 2.0798514391829155, + "grad_norm": 1.605394721031189, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8844349384307861, + "num_tokens": 408373670.0, + "step": 11200 + }, + { + "epoch": 2.080037140204271, + "grad_norm": 1.78180730342865, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8742607235908508, + "num_tokens": 408405387.0, + "step": 11201 + }, + { + "epoch": 2.080222841225627, + "grad_norm": 1.5487396717071533, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8795568943023682, + "num_tokens": 408446236.0, + "step": 11202 + }, + { + "epoch": 2.0804085422469822, + "grad_norm": 1.6455427408218384, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8811996579170227, + "num_tokens": 408481206.0, + "step": 11203 + }, + { + "epoch": 2.080594243268338, + "grad_norm": 1.4783744812011719, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8920883536338806, + "num_tokens": 408524679.0, + "step": 11204 + }, + { + "epoch": 2.0807799442896937, + "grad_norm": 1.418431282043457, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.8966843485832214, + "num_tokens": 408569995.0, + "step": 11205 + }, + { + "epoch": 2.080965645311049, + "grad_norm": 1.6120567321777344, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8896220326423645, + "num_tokens": 408608245.0, + "step": 11206 + }, + { + "epoch": 2.0811513463324047, + "grad_norm": 1.649319052696228, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8784092664718628, + "num_tokens": 408646418.0, + "step": 11207 + }, + { + "epoch": 2.0813370473537605, + "grad_norm": 1.7321721315383911, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8849459886550903, + "num_tokens": 408680661.0, + "step": 11208 + }, + { + "epoch": 2.081522748375116, + "grad_norm": 1.5338295698165894, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.8941100239753723, + "num_tokens": 408720985.0, + "step": 11209 + }, + { + "epoch": 2.0817084493964715, + "grad_norm": 1.6543879508972168, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.888196587562561, + "num_tokens": 408753787.0, + "step": 11210 + }, + { + "epoch": 2.081894150417827, + "grad_norm": 1.656835675239563, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8744719624519348, + "num_tokens": 408794186.0, + "step": 11211 + }, + { + "epoch": 2.082079851439183, + "grad_norm": 1.6230915784835815, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.893378496170044, + "num_tokens": 408827215.0, + "step": 11212 + }, + { + "epoch": 2.0822655524605387, + "grad_norm": 1.7019833326339722, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8842670321464539, + "num_tokens": 408861463.0, + "step": 11213 + }, + { + "epoch": 2.082451253481894, + "grad_norm": 1.6809382438659668, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8861832618713379, + "num_tokens": 408893869.0, + "step": 11214 + }, + { + "epoch": 2.0826369545032497, + "grad_norm": 1.778857707977295, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8688097596168518, + "num_tokens": 408930917.0, + "step": 11215 + }, + { + "epoch": 2.0828226555246054, + "grad_norm": 1.7261732816696167, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8755208253860474, + "num_tokens": 408963208.0, + "step": 11216 + }, + { + "epoch": 2.083008356545961, + "grad_norm": 1.6962742805480957, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8891860246658325, + "num_tokens": 408996965.0, + "step": 11217 + }, + { + "epoch": 2.0831940575673165, + "grad_norm": 1.6167033910751343, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8824996948242188, + "num_tokens": 409034401.0, + "step": 11218 + }, + { + "epoch": 2.083379758588672, + "grad_norm": 1.8198233842849731, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8797598481178284, + "num_tokens": 409066723.0, + "step": 11219 + }, + { + "epoch": 2.083565459610028, + "grad_norm": 1.6336941719055176, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8849762678146362, + "num_tokens": 409102862.0, + "step": 11220 + }, + { + "epoch": 2.0837511606313837, + "grad_norm": 1.6543142795562744, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8920466899871826, + "num_tokens": 409135855.0, + "step": 11221 + }, + { + "epoch": 2.083936861652739, + "grad_norm": 1.576696753501892, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8837710618972778, + "num_tokens": 409173644.0, + "step": 11222 + }, + { + "epoch": 2.0841225626740947, + "grad_norm": 1.7128506898880005, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8848128318786621, + "num_tokens": 409207425.0, + "step": 11223 + }, + { + "epoch": 2.0843082636954504, + "grad_norm": 1.7579236030578613, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.886523425579071, + "num_tokens": 409237730.0, + "step": 11224 + }, + { + "epoch": 2.084493964716806, + "grad_norm": 1.5723357200622559, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8820909857749939, + "num_tokens": 409278346.0, + "step": 11225 + }, + { + "epoch": 2.0846796657381614, + "grad_norm": 1.6846084594726562, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8832905292510986, + "num_tokens": 409314513.0, + "step": 11226 + }, + { + "epoch": 2.084865366759517, + "grad_norm": 1.8339178562164307, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8749865293502808, + "num_tokens": 409348926.0, + "step": 11227 + }, + { + "epoch": 2.085051067780873, + "grad_norm": 1.5989978313446045, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.8962939977645874, + "num_tokens": 409387240.0, + "step": 11228 + }, + { + "epoch": 2.085236768802228, + "grad_norm": 1.579425573348999, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.8987962007522583, + "num_tokens": 409426508.0, + "step": 11229 + }, + { + "epoch": 2.085422469823584, + "grad_norm": 1.5634506940841675, + "learning_rate": 1e-06, + "loss": 0.2548, + "mean_token_accuracy": 0.9061328172683716, + "num_tokens": 409457759.0, + "step": 11230 + }, + { + "epoch": 2.0856081708449397, + "grad_norm": 1.8668581247329712, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8774015307426453, + "num_tokens": 409488356.0, + "step": 11231 + }, + { + "epoch": 2.0857938718662954, + "grad_norm": 1.6138801574707031, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8951818943023682, + "num_tokens": 409522663.0, + "step": 11232 + }, + { + "epoch": 2.0859795728876507, + "grad_norm": 1.9846419095993042, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8810798525810242, + "num_tokens": 409557183.0, + "step": 11233 + }, + { + "epoch": 2.0861652739090064, + "grad_norm": 1.6375470161437988, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.879709005355835, + "num_tokens": 409596970.0, + "step": 11234 + }, + { + "epoch": 2.086350974930362, + "grad_norm": 1.7499090433120728, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8809833526611328, + "num_tokens": 409631133.0, + "step": 11235 + }, + { + "epoch": 2.086536675951718, + "grad_norm": 1.7050776481628418, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8840253353118896, + "num_tokens": 409662726.0, + "step": 11236 + }, + { + "epoch": 2.086722376973073, + "grad_norm": 1.5047216415405273, + "learning_rate": 1e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.9003346562385559, + "num_tokens": 409702358.0, + "step": 11237 + }, + { + "epoch": 2.086908077994429, + "grad_norm": 1.7165946960449219, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8912866115570068, + "num_tokens": 409736393.0, + "step": 11238 + }, + { + "epoch": 2.0870937790157846, + "grad_norm": 1.7327417135238647, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8887580037117004, + "num_tokens": 409769064.0, + "step": 11239 + }, + { + "epoch": 2.0872794800371404, + "grad_norm": 1.6495438814163208, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8896660804748535, + "num_tokens": 409802515.0, + "step": 11240 + }, + { + "epoch": 2.0874651810584957, + "grad_norm": 1.6334892511367798, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8844960927963257, + "num_tokens": 409842592.0, + "step": 11241 + }, + { + "epoch": 2.0876508820798514, + "grad_norm": 1.7669429779052734, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8867124915122986, + "num_tokens": 409873458.0, + "step": 11242 + }, + { + "epoch": 2.087836583101207, + "grad_norm": 1.7279767990112305, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8753204941749573, + "num_tokens": 409904889.0, + "step": 11243 + }, + { + "epoch": 2.088022284122563, + "grad_norm": 1.6981362104415894, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8799177408218384, + "num_tokens": 409936808.0, + "step": 11244 + }, + { + "epoch": 2.088207985143918, + "grad_norm": 1.6018588542938232, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.882255494594574, + "num_tokens": 409971967.0, + "step": 11245 + }, + { + "epoch": 2.088393686165274, + "grad_norm": 1.5618236064910889, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8886717557907104, + "num_tokens": 410008805.0, + "step": 11246 + }, + { + "epoch": 2.0885793871866296, + "grad_norm": 1.4504797458648682, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8954524993896484, + "num_tokens": 410050090.0, + "step": 11247 + }, + { + "epoch": 2.0887650882079853, + "grad_norm": 1.6365493535995483, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8980802297592163, + "num_tokens": 410082229.0, + "step": 11248 + }, + { + "epoch": 2.0889507892293406, + "grad_norm": 1.798865795135498, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8911547660827637, + "num_tokens": 410113047.0, + "step": 11249 + }, + { + "epoch": 2.0891364902506964, + "grad_norm": 1.5965365171432495, + "learning_rate": 1e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.8947164416313171, + "num_tokens": 410147298.0, + "step": 11250 + }, + { + "epoch": 2.089322191272052, + "grad_norm": 1.516728401184082, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8910863399505615, + "num_tokens": 410186583.0, + "step": 11251 + }, + { + "epoch": 2.0895078922934074, + "grad_norm": 1.6237636804580688, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8943418860435486, + "num_tokens": 410228989.0, + "step": 11252 + }, + { + "epoch": 2.089693593314763, + "grad_norm": 1.557864785194397, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8954898118972778, + "num_tokens": 410264913.0, + "step": 11253 + }, + { + "epoch": 2.089879294336119, + "grad_norm": 1.6066524982452393, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8919261693954468, + "num_tokens": 410299394.0, + "step": 11254 + }, + { + "epoch": 2.0900649953574746, + "grad_norm": 1.7945573329925537, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8898493051528931, + "num_tokens": 410329086.0, + "step": 11255 + }, + { + "epoch": 2.09025069637883, + "grad_norm": 1.5562055110931396, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8857529163360596, + "num_tokens": 410368506.0, + "step": 11256 + }, + { + "epoch": 2.0904363974001856, + "grad_norm": 1.686408281326294, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8750141859054565, + "num_tokens": 410402073.0, + "step": 11257 + }, + { + "epoch": 2.0906220984215413, + "grad_norm": 1.5748817920684814, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8763853311538696, + "num_tokens": 410441049.0, + "step": 11258 + }, + { + "epoch": 2.090807799442897, + "grad_norm": 1.5609210729599, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8934856653213501, + "num_tokens": 410478065.0, + "step": 11259 + }, + { + "epoch": 2.0909935004642524, + "grad_norm": 1.4541223049163818, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8935522437095642, + "num_tokens": 410523222.0, + "step": 11260 + }, + { + "epoch": 2.091179201485608, + "grad_norm": 1.6751468181610107, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8831157684326172, + "num_tokens": 410557466.0, + "step": 11261 + }, + { + "epoch": 2.091364902506964, + "grad_norm": 1.6099143028259277, + "learning_rate": 1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.8958205580711365, + "num_tokens": 410591058.0, + "step": 11262 + }, + { + "epoch": 2.0915506035283196, + "grad_norm": 1.8525078296661377, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8653770089149475, + "num_tokens": 410624035.0, + "step": 11263 + }, + { + "epoch": 2.091736304549675, + "grad_norm": 1.5793952941894531, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8867092132568359, + "num_tokens": 410659662.0, + "step": 11264 + }, + { + "epoch": 2.0919220055710306, + "grad_norm": 1.7013038396835327, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8907134532928467, + "num_tokens": 410690593.0, + "step": 11265 + }, + { + "epoch": 2.0921077065923863, + "grad_norm": 1.7031506299972534, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.878277063369751, + "num_tokens": 410725822.0, + "step": 11266 + }, + { + "epoch": 2.092293407613742, + "grad_norm": 1.8033944368362427, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8936955332756042, + "num_tokens": 410764998.0, + "step": 11267 + }, + { + "epoch": 2.0924791086350973, + "grad_norm": 1.5968883037567139, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8782361745834351, + "num_tokens": 410804401.0, + "step": 11268 + }, + { + "epoch": 2.092664809656453, + "grad_norm": 1.6838457584381104, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8939177393913269, + "num_tokens": 410834814.0, + "step": 11269 + }, + { + "epoch": 2.092850510677809, + "grad_norm": 1.6088337898254395, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8741813898086548, + "num_tokens": 410880107.0, + "step": 11270 + }, + { + "epoch": 2.0930362116991645, + "grad_norm": 1.606257677078247, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8890706300735474, + "num_tokens": 410916108.0, + "step": 11271 + }, + { + "epoch": 2.09322191272052, + "grad_norm": 1.6699174642562866, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8891428709030151, + "num_tokens": 410953293.0, + "step": 11272 + }, + { + "epoch": 2.0934076137418756, + "grad_norm": 1.5393251180648804, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8954956531524658, + "num_tokens": 410988444.0, + "step": 11273 + }, + { + "epoch": 2.0935933147632313, + "grad_norm": 1.7558622360229492, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8779721260070801, + "num_tokens": 411021629.0, + "step": 11274 + }, + { + "epoch": 2.093779015784587, + "grad_norm": 1.8667585849761963, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8747861385345459, + "num_tokens": 411055904.0, + "step": 11275 + }, + { + "epoch": 2.0939647168059423, + "grad_norm": 1.7671587467193604, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8843960762023926, + "num_tokens": 411087999.0, + "step": 11276 + }, + { + "epoch": 2.094150417827298, + "grad_norm": 1.6427100896835327, + "learning_rate": 1e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.8996951580047607, + "num_tokens": 411121573.0, + "step": 11277 + }, + { + "epoch": 2.094336118848654, + "grad_norm": 1.609791874885559, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8855682611465454, + "num_tokens": 411159149.0, + "step": 11278 + }, + { + "epoch": 2.094521819870009, + "grad_norm": 1.7815583944320679, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8839666247367859, + "num_tokens": 411193338.0, + "step": 11279 + }, + { + "epoch": 2.094707520891365, + "grad_norm": 1.6778007745742798, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.885463535785675, + "num_tokens": 411227042.0, + "step": 11280 + }, + { + "epoch": 2.0948932219127205, + "grad_norm": 1.6575572490692139, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8741568326950073, + "num_tokens": 411266909.0, + "step": 11281 + }, + { + "epoch": 2.0950789229340763, + "grad_norm": 1.6854959726333618, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8872635960578918, + "num_tokens": 411303055.0, + "step": 11282 + }, + { + "epoch": 2.0952646239554316, + "grad_norm": 1.5871460437774658, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8941390514373779, + "num_tokens": 411338917.0, + "step": 11283 + }, + { + "epoch": 2.0954503249767873, + "grad_norm": 1.5149171352386475, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8915979862213135, + "num_tokens": 411378393.0, + "step": 11284 + }, + { + "epoch": 2.095636025998143, + "grad_norm": 1.637969732284546, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8838905096054077, + "num_tokens": 411413686.0, + "step": 11285 + }, + { + "epoch": 2.0958217270194988, + "grad_norm": 1.6295982599258423, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8915644288063049, + "num_tokens": 411448602.0, + "step": 11286 + }, + { + "epoch": 2.096007428040854, + "grad_norm": 1.514131784439087, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8933145403862, + "num_tokens": 411486980.0, + "step": 11287 + }, + { + "epoch": 2.09619312906221, + "grad_norm": 1.6376514434814453, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8822773694992065, + "num_tokens": 411524111.0, + "step": 11288 + }, + { + "epoch": 2.0963788300835655, + "grad_norm": 1.5505988597869873, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8877643942832947, + "num_tokens": 411560310.0, + "step": 11289 + }, + { + "epoch": 2.0965645311049212, + "grad_norm": 1.6491303443908691, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8877708315849304, + "num_tokens": 411597911.0, + "step": 11290 + }, + { + "epoch": 2.0967502321262765, + "grad_norm": 1.7289066314697266, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8822386264801025, + "num_tokens": 411630116.0, + "step": 11291 + }, + { + "epoch": 2.0969359331476323, + "grad_norm": 1.5788785219192505, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8913134932518005, + "num_tokens": 411670007.0, + "step": 11292 + }, + { + "epoch": 2.097121634168988, + "grad_norm": 1.6042040586471558, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8941511511802673, + "num_tokens": 411708533.0, + "step": 11293 + }, + { + "epoch": 2.0973073351903437, + "grad_norm": 1.59596586227417, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8918241858482361, + "num_tokens": 411746431.0, + "step": 11294 + }, + { + "epoch": 2.097493036211699, + "grad_norm": 1.5488245487213135, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8823825120925903, + "num_tokens": 411785182.0, + "step": 11295 + }, + { + "epoch": 2.0976787372330548, + "grad_norm": 1.5739107131958008, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8890029191970825, + "num_tokens": 411820080.0, + "step": 11296 + }, + { + "epoch": 2.0978644382544105, + "grad_norm": 1.4757112264633179, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8720494508743286, + "num_tokens": 411866653.0, + "step": 11297 + }, + { + "epoch": 2.0980501392757662, + "grad_norm": 1.5469489097595215, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8934183120727539, + "num_tokens": 411904436.0, + "step": 11298 + }, + { + "epoch": 2.0982358402971215, + "grad_norm": 1.5965672731399536, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8791425824165344, + "num_tokens": 411942497.0, + "step": 11299 + }, + { + "epoch": 2.0984215413184772, + "grad_norm": 1.6709791421890259, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8878430724143982, + "num_tokens": 411978306.0, + "step": 11300 + }, + { + "epoch": 2.098607242339833, + "grad_norm": 1.5034418106079102, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8853111267089844, + "num_tokens": 412019048.0, + "step": 11301 + }, + { + "epoch": 2.0987929433611887, + "grad_norm": 1.6956473588943481, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8938016891479492, + "num_tokens": 412050081.0, + "step": 11302 + }, + { + "epoch": 2.098978644382544, + "grad_norm": 1.6154348850250244, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8882946968078613, + "num_tokens": 412086970.0, + "step": 11303 + }, + { + "epoch": 2.0991643454038997, + "grad_norm": 1.6205304861068726, + "learning_rate": 1e-06, + "loss": 0.2744, + "mean_token_accuracy": 0.8957436084747314, + "num_tokens": 412120076.0, + "step": 11304 + }, + { + "epoch": 2.0993500464252555, + "grad_norm": 1.689515471458435, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8895704746246338, + "num_tokens": 412152746.0, + "step": 11305 + }, + { + "epoch": 2.0995357474466108, + "grad_norm": 1.6579006910324097, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8664636611938477, + "num_tokens": 412188948.0, + "step": 11306 + }, + { + "epoch": 2.0997214484679665, + "grad_norm": 1.4903310537338257, + "learning_rate": 1e-06, + "loss": 0.264, + "mean_token_accuracy": 0.9036885499954224, + "num_tokens": 412224250.0, + "step": 11307 + }, + { + "epoch": 2.0999071494893222, + "grad_norm": 1.5170496702194214, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8888379335403442, + "num_tokens": 412264972.0, + "step": 11308 + }, + { + "epoch": 2.100092850510678, + "grad_norm": 1.7459014654159546, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.885697603225708, + "num_tokens": 412294761.0, + "step": 11309 + }, + { + "epoch": 2.1002785515320332, + "grad_norm": 1.4992194175720215, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8902667760848999, + "num_tokens": 412335756.0, + "step": 11310 + }, + { + "epoch": 2.100464252553389, + "grad_norm": 1.5080535411834717, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8877241611480713, + "num_tokens": 412372245.0, + "step": 11311 + }, + { + "epoch": 2.1006499535747447, + "grad_norm": 1.7910141944885254, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8787193894386292, + "num_tokens": 412406827.0, + "step": 11312 + }, + { + "epoch": 2.1008356545961004, + "grad_norm": 1.6184813976287842, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8847947120666504, + "num_tokens": 412444390.0, + "step": 11313 + }, + { + "epoch": 2.1010213556174557, + "grad_norm": 1.5487574338912964, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8944351673126221, + "num_tokens": 412480799.0, + "step": 11314 + }, + { + "epoch": 2.1012070566388115, + "grad_norm": 1.652856469154358, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8720168471336365, + "num_tokens": 412519382.0, + "step": 11315 + }, + { + "epoch": 2.101392757660167, + "grad_norm": 1.5851722955703735, + "learning_rate": 1e-06, + "loss": 0.2524, + "mean_token_accuracy": 0.9076999425888062, + "num_tokens": 412552085.0, + "step": 11316 + }, + { + "epoch": 2.101578458681523, + "grad_norm": 1.679252028465271, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8862850666046143, + "num_tokens": 412587903.0, + "step": 11317 + }, + { + "epoch": 2.101764159702878, + "grad_norm": 1.6086223125457764, + "learning_rate": 1e-06, + "loss": 0.2646, + "mean_token_accuracy": 0.9026201963424683, + "num_tokens": 412624388.0, + "step": 11318 + }, + { + "epoch": 2.101949860724234, + "grad_norm": 1.5253499746322632, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8837909698486328, + "num_tokens": 412668701.0, + "step": 11319 + }, + { + "epoch": 2.1021355617455897, + "grad_norm": 1.659752368927002, + "learning_rate": 1e-06, + "loss": 0.2656, + "mean_token_accuracy": 0.9014106392860413, + "num_tokens": 412701906.0, + "step": 11320 + }, + { + "epoch": 2.1023212627669454, + "grad_norm": 1.6283913850784302, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8931466341018677, + "num_tokens": 412738361.0, + "step": 11321 + }, + { + "epoch": 2.1025069637883007, + "grad_norm": 1.5795150995254517, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8909094929695129, + "num_tokens": 412775900.0, + "step": 11322 + }, + { + "epoch": 2.1026926648096564, + "grad_norm": 1.4760277271270752, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8920449614524841, + "num_tokens": 412815439.0, + "step": 11323 + }, + { + "epoch": 2.102878365831012, + "grad_norm": 1.5836516618728638, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8908042311668396, + "num_tokens": 412851738.0, + "step": 11324 + }, + { + "epoch": 2.103064066852368, + "grad_norm": 1.6009912490844727, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.893751323223114, + "num_tokens": 412885064.0, + "step": 11325 + }, + { + "epoch": 2.103249767873723, + "grad_norm": 1.7505031824111938, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8812077641487122, + "num_tokens": 412917409.0, + "step": 11326 + }, + { + "epoch": 2.103435468895079, + "grad_norm": 1.5963784456253052, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8750388026237488, + "num_tokens": 412956762.0, + "step": 11327 + }, + { + "epoch": 2.1036211699164347, + "grad_norm": 1.5811225175857544, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.882463812828064, + "num_tokens": 412996074.0, + "step": 11328 + }, + { + "epoch": 2.10380687093779, + "grad_norm": 1.613625407218933, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8908936977386475, + "num_tokens": 413031541.0, + "step": 11329 + }, + { + "epoch": 2.1039925719591457, + "grad_norm": 1.5994899272918701, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8922523856163025, + "num_tokens": 413068101.0, + "step": 11330 + }, + { + "epoch": 2.1041782729805014, + "grad_norm": 1.4229941368103027, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8970807194709778, + "num_tokens": 413109672.0, + "step": 11331 + }, + { + "epoch": 2.104363974001857, + "grad_norm": 1.6763734817504883, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8844446539878845, + "num_tokens": 413146387.0, + "step": 11332 + }, + { + "epoch": 2.1045496750232124, + "grad_norm": 1.6659730672836304, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8854411244392395, + "num_tokens": 413179797.0, + "step": 11333 + }, + { + "epoch": 2.104735376044568, + "grad_norm": 1.6175129413604736, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8924828767776489, + "num_tokens": 413215283.0, + "step": 11334 + }, + { + "epoch": 2.104921077065924, + "grad_norm": 1.6450530290603638, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.884065568447113, + "num_tokens": 413250930.0, + "step": 11335 + }, + { + "epoch": 2.1051067780872796, + "grad_norm": 1.600353717803955, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8814610838890076, + "num_tokens": 413289102.0, + "step": 11336 + }, + { + "epoch": 2.105292479108635, + "grad_norm": 1.5774720907211304, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.890820324420929, + "num_tokens": 413327775.0, + "step": 11337 + }, + { + "epoch": 2.1054781801299907, + "grad_norm": 1.608038306236267, + "learning_rate": 1e-06, + "loss": 0.2744, + "mean_token_accuracy": 0.9034145474433899, + "num_tokens": 413363305.0, + "step": 11338 + }, + { + "epoch": 2.1056638811513464, + "grad_norm": 1.569106936454773, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8939220309257507, + "num_tokens": 413398460.0, + "step": 11339 + }, + { + "epoch": 2.105849582172702, + "grad_norm": 1.4911973476409912, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8946705460548401, + "num_tokens": 413435990.0, + "step": 11340 + }, + { + "epoch": 2.1060352831940574, + "grad_norm": 1.7138489484786987, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8746432065963745, + "num_tokens": 413472332.0, + "step": 11341 + }, + { + "epoch": 2.106220984215413, + "grad_norm": 1.4708460569381714, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8946640491485596, + "num_tokens": 413511641.0, + "step": 11342 + }, + { + "epoch": 2.106406685236769, + "grad_norm": 1.6760809421539307, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8868496417999268, + "num_tokens": 413547302.0, + "step": 11343 + }, + { + "epoch": 2.1065923862581246, + "grad_norm": 1.9872456789016724, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8935791254043579, + "num_tokens": 413577880.0, + "step": 11344 + }, + { + "epoch": 2.10677808727948, + "grad_norm": 2.0032670497894287, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8756604194641113, + "num_tokens": 413610998.0, + "step": 11345 + }, + { + "epoch": 2.1069637883008356, + "grad_norm": 1.4612566232681274, + "learning_rate": 1e-06, + "loss": 0.265, + "mean_token_accuracy": 0.902768611907959, + "num_tokens": 413649620.0, + "step": 11346 + }, + { + "epoch": 2.1071494893221914, + "grad_norm": 1.5292160511016846, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.879356324672699, + "num_tokens": 413692167.0, + "step": 11347 + }, + { + "epoch": 2.107335190343547, + "grad_norm": 1.5990883111953735, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8871509432792664, + "num_tokens": 413731287.0, + "step": 11348 + }, + { + "epoch": 2.1075208913649024, + "grad_norm": 1.7013611793518066, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8682281970977783, + "num_tokens": 413769656.0, + "step": 11349 + }, + { + "epoch": 2.107706592386258, + "grad_norm": 1.622052788734436, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.887235164642334, + "num_tokens": 413809290.0, + "step": 11350 + }, + { + "epoch": 2.107892293407614, + "grad_norm": 1.6469917297363281, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8867826461791992, + "num_tokens": 413845170.0, + "step": 11351 + }, + { + "epoch": 2.108077994428969, + "grad_norm": 1.717156171798706, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8782685399055481, + "num_tokens": 413880877.0, + "step": 11352 + }, + { + "epoch": 2.108263695450325, + "grad_norm": 1.71677565574646, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8767125010490417, + "num_tokens": 413915318.0, + "step": 11353 + }, + { + "epoch": 2.1084493964716806, + "grad_norm": 1.857229232788086, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8840006589889526, + "num_tokens": 413946464.0, + "step": 11354 + }, + { + "epoch": 2.1086350974930363, + "grad_norm": 1.6342008113861084, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8943997621536255, + "num_tokens": 413987830.0, + "step": 11355 + }, + { + "epoch": 2.1088207985143916, + "grad_norm": 1.8138020038604736, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8867002725601196, + "num_tokens": 414021616.0, + "step": 11356 + }, + { + "epoch": 2.1090064995357474, + "grad_norm": 1.6578019857406616, + "learning_rate": 1e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9000748991966248, + "num_tokens": 414056344.0, + "step": 11357 + }, + { + "epoch": 2.109192200557103, + "grad_norm": 1.8537923097610474, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.867967426776886, + "num_tokens": 414088624.0, + "step": 11358 + }, + { + "epoch": 2.109377901578459, + "grad_norm": 1.7282487154006958, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8796286582946777, + "num_tokens": 414120723.0, + "step": 11359 + }, + { + "epoch": 2.109563602599814, + "grad_norm": 1.5466642379760742, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8894069194793701, + "num_tokens": 414158679.0, + "step": 11360 + }, + { + "epoch": 2.10974930362117, + "grad_norm": 1.6557468175888062, + "learning_rate": 1e-06, + "loss": 0.2679, + "mean_token_accuracy": 0.902355432510376, + "num_tokens": 414192459.0, + "step": 11361 + }, + { + "epoch": 2.1099350046425256, + "grad_norm": 1.6817634105682373, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8803511261940002, + "num_tokens": 414228383.0, + "step": 11362 + }, + { + "epoch": 2.1101207056638813, + "grad_norm": 1.7408448457717896, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8815255165100098, + "num_tokens": 414260662.0, + "step": 11363 + }, + { + "epoch": 2.1103064066852366, + "grad_norm": 1.58211088180542, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8910857439041138, + "num_tokens": 414298679.0, + "step": 11364 + }, + { + "epoch": 2.1104921077065923, + "grad_norm": 1.5740951299667358, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8867788314819336, + "num_tokens": 414335666.0, + "step": 11365 + }, + { + "epoch": 2.110677808727948, + "grad_norm": 1.630661964416504, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8865410089492798, + "num_tokens": 414371386.0, + "step": 11366 + }, + { + "epoch": 2.110863509749304, + "grad_norm": 1.6157336235046387, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.894639253616333, + "num_tokens": 414403662.0, + "step": 11367 + }, + { + "epoch": 2.111049210770659, + "grad_norm": 1.654597282409668, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8794275522232056, + "num_tokens": 414438422.0, + "step": 11368 + }, + { + "epoch": 2.111234911792015, + "grad_norm": 1.6232253313064575, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8951582908630371, + "num_tokens": 414470809.0, + "step": 11369 + }, + { + "epoch": 2.1114206128133706, + "grad_norm": 1.661394715309143, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8775099515914917, + "num_tokens": 414509077.0, + "step": 11370 + }, + { + "epoch": 2.1116063138347263, + "grad_norm": 1.6845486164093018, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8934276103973389, + "num_tokens": 414541261.0, + "step": 11371 + }, + { + "epoch": 2.1117920148560816, + "grad_norm": 1.6164462566375732, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.886528730392456, + "num_tokens": 414575130.0, + "step": 11372 + }, + { + "epoch": 2.1119777158774373, + "grad_norm": 1.4094401597976685, + "learning_rate": 1e-06, + "loss": 0.2674, + "mean_token_accuracy": 0.9038959741592407, + "num_tokens": 414613953.0, + "step": 11373 + }, + { + "epoch": 2.112163416898793, + "grad_norm": 1.5478571653366089, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8919195532798767, + "num_tokens": 414649047.0, + "step": 11374 + }, + { + "epoch": 2.1123491179201483, + "grad_norm": 1.6199778318405151, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8945342302322388, + "num_tokens": 414683767.0, + "step": 11375 + }, + { + "epoch": 2.112534818941504, + "grad_norm": 1.5428826808929443, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.891391932964325, + "num_tokens": 414720681.0, + "step": 11376 + }, + { + "epoch": 2.11272051996286, + "grad_norm": 1.6126822233200073, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8869198560714722, + "num_tokens": 414759323.0, + "step": 11377 + }, + { + "epoch": 2.1129062209842155, + "grad_norm": 1.6222440004348755, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8869509100914001, + "num_tokens": 414794977.0, + "step": 11378 + }, + { + "epoch": 2.113091922005571, + "grad_norm": 1.5567266941070557, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8937592506408691, + "num_tokens": 414831940.0, + "step": 11379 + }, + { + "epoch": 2.1132776230269266, + "grad_norm": 1.7972722053527832, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.872367262840271, + "num_tokens": 414867439.0, + "step": 11380 + }, + { + "epoch": 2.1134633240482823, + "grad_norm": 1.5697600841522217, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8907934427261353, + "num_tokens": 414901178.0, + "step": 11381 + }, + { + "epoch": 2.113649025069638, + "grad_norm": 1.5926628112792969, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8943907022476196, + "num_tokens": 414935687.0, + "step": 11382 + }, + { + "epoch": 2.1138347260909933, + "grad_norm": 1.5977870225906372, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8835508227348328, + "num_tokens": 414973267.0, + "step": 11383 + }, + { + "epoch": 2.114020427112349, + "grad_norm": 1.7338624000549316, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8842884302139282, + "num_tokens": 415006873.0, + "step": 11384 + }, + { + "epoch": 2.114206128133705, + "grad_norm": 1.410082221031189, + "learning_rate": 1e-06, + "loss": 0.2739, + "mean_token_accuracy": 0.9015610814094543, + "num_tokens": 415045966.0, + "step": 11385 + }, + { + "epoch": 2.1143918291550605, + "grad_norm": 1.7430624961853027, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8755694627761841, + "num_tokens": 415079560.0, + "step": 11386 + }, + { + "epoch": 2.114577530176416, + "grad_norm": 1.6586631536483765, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8857155442237854, + "num_tokens": 415116368.0, + "step": 11387 + }, + { + "epoch": 2.1147632311977715, + "grad_norm": 1.7495843172073364, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8774389028549194, + "num_tokens": 415153449.0, + "step": 11388 + }, + { + "epoch": 2.1149489322191273, + "grad_norm": 1.6063164472579956, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8894088268280029, + "num_tokens": 415191486.0, + "step": 11389 + }, + { + "epoch": 2.115134633240483, + "grad_norm": 1.883872628211975, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8875733017921448, + "num_tokens": 415223504.0, + "step": 11390 + }, + { + "epoch": 2.1153203342618383, + "grad_norm": 1.5700178146362305, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8852686285972595, + "num_tokens": 415261836.0, + "step": 11391 + }, + { + "epoch": 2.115506035283194, + "grad_norm": 1.513387680053711, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.886534571647644, + "num_tokens": 415301138.0, + "step": 11392 + }, + { + "epoch": 2.1156917363045498, + "grad_norm": 1.6456717252731323, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8774172067642212, + "num_tokens": 415335334.0, + "step": 11393 + }, + { + "epoch": 2.1158774373259055, + "grad_norm": 1.528667688369751, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8918894529342651, + "num_tokens": 415373786.0, + "step": 11394 + }, + { + "epoch": 2.116063138347261, + "grad_norm": 1.4984444379806519, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8793372511863708, + "num_tokens": 415419214.0, + "step": 11395 + }, + { + "epoch": 2.1162488393686165, + "grad_norm": 1.6034728288650513, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8889876008033752, + "num_tokens": 415457643.0, + "step": 11396 + }, + { + "epoch": 2.1164345403899723, + "grad_norm": 1.5106353759765625, + "learning_rate": 1e-06, + "loss": 0.259, + "mean_token_accuracy": 0.904111385345459, + "num_tokens": 415495543.0, + "step": 11397 + }, + { + "epoch": 2.1166202414113275, + "grad_norm": 1.6964426040649414, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8949272036552429, + "num_tokens": 415528492.0, + "step": 11398 + }, + { + "epoch": 2.1168059424326833, + "grad_norm": 1.6553672552108765, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8830658197402954, + "num_tokens": 415562697.0, + "step": 11399 + }, + { + "epoch": 2.116991643454039, + "grad_norm": 1.724399209022522, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8792495727539062, + "num_tokens": 415593589.0, + "step": 11400 + }, + { + "epoch": 2.1171773444753947, + "grad_norm": 1.875467300415039, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.870957612991333, + "num_tokens": 415625746.0, + "step": 11401 + }, + { + "epoch": 2.11736304549675, + "grad_norm": 1.6472322940826416, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8873289823532104, + "num_tokens": 415658378.0, + "step": 11402 + }, + { + "epoch": 2.1175487465181058, + "grad_norm": 1.5673850774765015, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8815627098083496, + "num_tokens": 415698672.0, + "step": 11403 + }, + { + "epoch": 2.1177344475394615, + "grad_norm": 1.409567952156067, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.8962899446487427, + "num_tokens": 415741243.0, + "step": 11404 + }, + { + "epoch": 2.1179201485608172, + "grad_norm": 1.5002492666244507, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8841760158538818, + "num_tokens": 415780628.0, + "step": 11405 + }, + { + "epoch": 2.1181058495821725, + "grad_norm": 1.5746853351593018, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.894497275352478, + "num_tokens": 415817029.0, + "step": 11406 + }, + { + "epoch": 2.1182915506035283, + "grad_norm": 1.634091854095459, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8750400543212891, + "num_tokens": 415852524.0, + "step": 11407 + }, + { + "epoch": 2.118477251624884, + "grad_norm": 1.690018653869629, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8783347606658936, + "num_tokens": 415888640.0, + "step": 11408 + }, + { + "epoch": 2.1186629526462397, + "grad_norm": 1.754227638244629, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8778489828109741, + "num_tokens": 415921442.0, + "step": 11409 + }, + { + "epoch": 2.118848653667595, + "grad_norm": 1.6431792974472046, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8740050792694092, + "num_tokens": 415958168.0, + "step": 11410 + }, + { + "epoch": 2.1190343546889507, + "grad_norm": 1.4100314378738403, + "learning_rate": 1e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.9024925231933594, + "num_tokens": 416000108.0, + "step": 11411 + }, + { + "epoch": 2.1192200557103065, + "grad_norm": 1.7437607049942017, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8869483470916748, + "num_tokens": 416032307.0, + "step": 11412 + }, + { + "epoch": 2.119405756731662, + "grad_norm": 1.668631672859192, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8831695318222046, + "num_tokens": 416067963.0, + "step": 11413 + }, + { + "epoch": 2.1195914577530175, + "grad_norm": 1.5122709274291992, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8871673345565796, + "num_tokens": 416106877.0, + "step": 11414 + }, + { + "epoch": 2.1197771587743732, + "grad_norm": 1.6085094213485718, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8825721740722656, + "num_tokens": 416143341.0, + "step": 11415 + }, + { + "epoch": 2.119962859795729, + "grad_norm": 1.4972562789916992, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8929589986801147, + "num_tokens": 416180944.0, + "step": 11416 + }, + { + "epoch": 2.1201485608170847, + "grad_norm": 1.4837243556976318, + "learning_rate": 1e-06, + "loss": 0.2744, + "mean_token_accuracy": 0.8997869491577148, + "num_tokens": 416218054.0, + "step": 11417 + }, + { + "epoch": 2.12033426183844, + "grad_norm": 1.4726446866989136, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8911486268043518, + "num_tokens": 416259356.0, + "step": 11418 + }, + { + "epoch": 2.1205199628597957, + "grad_norm": 1.6365667581558228, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8878281116485596, + "num_tokens": 416294778.0, + "step": 11419 + }, + { + "epoch": 2.1207056638811514, + "grad_norm": 1.906428337097168, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8904094696044922, + "num_tokens": 416327066.0, + "step": 11420 + }, + { + "epoch": 2.1208913649025067, + "grad_norm": 1.5243972539901733, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8855435848236084, + "num_tokens": 416364816.0, + "step": 11421 + }, + { + "epoch": 2.1210770659238625, + "grad_norm": 1.659401774406433, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8782883882522583, + "num_tokens": 416404047.0, + "step": 11422 + }, + { + "epoch": 2.121262766945218, + "grad_norm": 1.5994985103607178, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8821605443954468, + "num_tokens": 416442957.0, + "step": 11423 + }, + { + "epoch": 2.121448467966574, + "grad_norm": 1.5844582319259644, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8937840461730957, + "num_tokens": 416477572.0, + "step": 11424 + }, + { + "epoch": 2.1216341689879292, + "grad_norm": 1.6756114959716797, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8879336714744568, + "num_tokens": 416514054.0, + "step": 11425 + }, + { + "epoch": 2.121819870009285, + "grad_norm": 1.6782170534133911, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8741562366485596, + "num_tokens": 416551754.0, + "step": 11426 + }, + { + "epoch": 2.1220055710306407, + "grad_norm": 1.5820865631103516, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8884223103523254, + "num_tokens": 416590050.0, + "step": 11427 + }, + { + "epoch": 2.1221912720519964, + "grad_norm": 1.5111005306243896, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8891960382461548, + "num_tokens": 416629185.0, + "step": 11428 + }, + { + "epoch": 2.1223769730733517, + "grad_norm": 1.5137754678726196, + "learning_rate": 1e-06, + "loss": 0.2758, + "mean_token_accuracy": 0.8970807790756226, + "num_tokens": 416664915.0, + "step": 11429 + }, + { + "epoch": 2.1225626740947074, + "grad_norm": 1.84535551071167, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8570516705513, + "num_tokens": 416703591.0, + "step": 11430 + }, + { + "epoch": 2.122748375116063, + "grad_norm": 1.600977897644043, + "learning_rate": 1e-06, + "loss": 0.2569, + "mean_token_accuracy": 0.9012531042098999, + "num_tokens": 416734871.0, + "step": 11431 + }, + { + "epoch": 2.122934076137419, + "grad_norm": 1.60835862159729, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8687103986740112, + "num_tokens": 416773819.0, + "step": 11432 + }, + { + "epoch": 2.123119777158774, + "grad_norm": 1.6048120260238647, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8840638399124146, + "num_tokens": 416810250.0, + "step": 11433 + }, + { + "epoch": 2.12330547818013, + "grad_norm": 1.561439871788025, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8834450244903564, + "num_tokens": 416850572.0, + "step": 11434 + }, + { + "epoch": 2.1234911792014857, + "grad_norm": 1.567192792892456, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8797453045845032, + "num_tokens": 416888853.0, + "step": 11435 + }, + { + "epoch": 2.1236768802228414, + "grad_norm": 1.5789886713027954, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8822182416915894, + "num_tokens": 416926309.0, + "step": 11436 + }, + { + "epoch": 2.1238625812441967, + "grad_norm": 1.4642778635025024, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8807488083839417, + "num_tokens": 416970337.0, + "step": 11437 + }, + { + "epoch": 2.1240482822655524, + "grad_norm": 1.6863681077957153, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8874008655548096, + "num_tokens": 417000315.0, + "step": 11438 + }, + { + "epoch": 2.124233983286908, + "grad_norm": 1.5536161661148071, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8890150785446167, + "num_tokens": 417036406.0, + "step": 11439 + }, + { + "epoch": 2.124419684308264, + "grad_norm": 1.5779225826263428, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8772405385971069, + "num_tokens": 417078285.0, + "step": 11440 + }, + { + "epoch": 2.124605385329619, + "grad_norm": 1.5104321241378784, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.880264163017273, + "num_tokens": 417119878.0, + "step": 11441 + }, + { + "epoch": 2.124791086350975, + "grad_norm": 1.5687789916992188, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.8946176767349243, + "num_tokens": 417154613.0, + "step": 11442 + }, + { + "epoch": 2.1249767873723306, + "grad_norm": 1.5172252655029297, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8824002146720886, + "num_tokens": 417193919.0, + "step": 11443 + }, + { + "epoch": 2.125162488393686, + "grad_norm": 1.524181842803955, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8795549869537354, + "num_tokens": 417234464.0, + "step": 11444 + }, + { + "epoch": 2.1253481894150417, + "grad_norm": 1.5390576124191284, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.878768801689148, + "num_tokens": 417273309.0, + "step": 11445 + }, + { + "epoch": 2.1255338904363974, + "grad_norm": 1.764331340789795, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8845018148422241, + "num_tokens": 417309626.0, + "step": 11446 + }, + { + "epoch": 2.125719591457753, + "grad_norm": 1.6424108743667603, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8880904316902161, + "num_tokens": 417346867.0, + "step": 11447 + }, + { + "epoch": 2.125905292479109, + "grad_norm": 1.5591373443603516, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8839755654335022, + "num_tokens": 417385478.0, + "step": 11448 + }, + { + "epoch": 2.126090993500464, + "grad_norm": 1.6452816724777222, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8844907283782959, + "num_tokens": 417423258.0, + "step": 11449 + }, + { + "epoch": 2.12627669452182, + "grad_norm": 1.7251574993133545, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.8964656591415405, + "num_tokens": 417454935.0, + "step": 11450 + }, + { + "epoch": 2.1264623955431756, + "grad_norm": 1.6701408624649048, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8723745346069336, + "num_tokens": 417491632.0, + "step": 11451 + }, + { + "epoch": 2.126648096564531, + "grad_norm": 1.601144790649414, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8663479685783386, + "num_tokens": 417533583.0, + "step": 11452 + }, + { + "epoch": 2.1268337975858866, + "grad_norm": 1.5990458726882935, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8788856863975525, + "num_tokens": 417570917.0, + "step": 11453 + }, + { + "epoch": 2.1270194986072424, + "grad_norm": 1.576525092124939, + "learning_rate": 1e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.8985880017280579, + "num_tokens": 417606549.0, + "step": 11454 + }, + { + "epoch": 2.127205199628598, + "grad_norm": 1.537772297859192, + "learning_rate": 1e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.9014172554016113, + "num_tokens": 417643571.0, + "step": 11455 + }, + { + "epoch": 2.1273909006499534, + "grad_norm": 1.6246399879455566, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8940168023109436, + "num_tokens": 417677907.0, + "step": 11456 + }, + { + "epoch": 2.127576601671309, + "grad_norm": 1.7295182943344116, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8839044570922852, + "num_tokens": 417711535.0, + "step": 11457 + }, + { + "epoch": 2.127762302692665, + "grad_norm": 1.4701409339904785, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.897097110748291, + "num_tokens": 417749131.0, + "step": 11458 + }, + { + "epoch": 2.1279480037140206, + "grad_norm": 1.5943260192871094, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8692445755004883, + "num_tokens": 417786742.0, + "step": 11459 + }, + { + "epoch": 2.128133704735376, + "grad_norm": 1.7530019283294678, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8766918778419495, + "num_tokens": 417826384.0, + "step": 11460 + }, + { + "epoch": 2.1283194057567316, + "grad_norm": 1.6776102781295776, + "learning_rate": 1e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.8918617367744446, + "num_tokens": 417859233.0, + "step": 11461 + }, + { + "epoch": 2.1285051067780874, + "grad_norm": 1.5113173723220825, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8844504356384277, + "num_tokens": 417901694.0, + "step": 11462 + }, + { + "epoch": 2.128690807799443, + "grad_norm": 1.589067816734314, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8854201436042786, + "num_tokens": 417937129.0, + "step": 11463 + }, + { + "epoch": 2.1288765088207984, + "grad_norm": 1.4721237421035767, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8937510251998901, + "num_tokens": 417978335.0, + "step": 11464 + }, + { + "epoch": 2.129062209842154, + "grad_norm": 1.646728515625, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8830490708351135, + "num_tokens": 418013901.0, + "step": 11465 + }, + { + "epoch": 2.12924791086351, + "grad_norm": 1.6736445426940918, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8837796449661255, + "num_tokens": 418048648.0, + "step": 11466 + }, + { + "epoch": 2.1294336118848656, + "grad_norm": 1.5240468978881836, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8862489461898804, + "num_tokens": 418091924.0, + "step": 11467 + }, + { + "epoch": 2.129619312906221, + "grad_norm": 1.578555703163147, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8768777251243591, + "num_tokens": 418135428.0, + "step": 11468 + }, + { + "epoch": 2.1298050139275766, + "grad_norm": 1.6646015644073486, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8815020322799683, + "num_tokens": 418173233.0, + "step": 11469 + }, + { + "epoch": 2.1299907149489323, + "grad_norm": 1.710281252861023, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8846359252929688, + "num_tokens": 418210369.0, + "step": 11470 + }, + { + "epoch": 2.130176415970288, + "grad_norm": 1.6255238056182861, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8873440027236938, + "num_tokens": 418246464.0, + "step": 11471 + }, + { + "epoch": 2.1303621169916434, + "grad_norm": 1.749591588973999, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8857362270355225, + "num_tokens": 418279912.0, + "step": 11472 + }, + { + "epoch": 2.130547818012999, + "grad_norm": 1.6877880096435547, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8849100470542908, + "num_tokens": 418313333.0, + "step": 11473 + }, + { + "epoch": 2.130733519034355, + "grad_norm": 1.5790443420410156, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8725705742835999, + "num_tokens": 418353950.0, + "step": 11474 + }, + { + "epoch": 2.13091922005571, + "grad_norm": 1.710299015045166, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8854653835296631, + "num_tokens": 418388054.0, + "step": 11475 + }, + { + "epoch": 2.131104921077066, + "grad_norm": 1.5731719732284546, + "learning_rate": 1e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.8985881805419922, + "num_tokens": 418421973.0, + "step": 11476 + }, + { + "epoch": 2.1312906220984216, + "grad_norm": 1.5228077173233032, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8945482969284058, + "num_tokens": 418460743.0, + "step": 11477 + }, + { + "epoch": 2.1314763231197773, + "grad_norm": 1.6731481552124023, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8872951865196228, + "num_tokens": 418496542.0, + "step": 11478 + }, + { + "epoch": 2.1316620241411326, + "grad_norm": 1.8455350399017334, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.864851713180542, + "num_tokens": 418528877.0, + "step": 11479 + }, + { + "epoch": 2.1318477251624883, + "grad_norm": 1.506238341331482, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8908718824386597, + "num_tokens": 418566694.0, + "step": 11480 + }, + { + "epoch": 2.132033426183844, + "grad_norm": 1.714699625968933, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8858907222747803, + "num_tokens": 418598697.0, + "step": 11481 + }, + { + "epoch": 2.1322191272052, + "grad_norm": 1.4552994966506958, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8790655732154846, + "num_tokens": 418642525.0, + "step": 11482 + }, + { + "epoch": 2.132404828226555, + "grad_norm": 1.7353498935699463, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.881595253944397, + "num_tokens": 418675157.0, + "step": 11483 + }, + { + "epoch": 2.132590529247911, + "grad_norm": 1.758962869644165, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8855246305465698, + "num_tokens": 418705929.0, + "step": 11484 + }, + { + "epoch": 2.1327762302692665, + "grad_norm": 1.6587766408920288, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8915432691574097, + "num_tokens": 418740053.0, + "step": 11485 + }, + { + "epoch": 2.1329619312906223, + "grad_norm": 1.5632113218307495, + "learning_rate": 1e-06, + "loss": 0.2796, + "mean_token_accuracy": 0.8967074751853943, + "num_tokens": 418773750.0, + "step": 11486 + }, + { + "epoch": 2.1331476323119776, + "grad_norm": 1.6750268936157227, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8749819993972778, + "num_tokens": 418810565.0, + "step": 11487 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 1.9330750703811646, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8686888217926025, + "num_tokens": 418841592.0, + "step": 11488 + }, + { + "epoch": 2.133519034354689, + "grad_norm": 1.5467764139175415, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.881433367729187, + "num_tokens": 418879330.0, + "step": 11489 + }, + { + "epoch": 2.1337047353760448, + "grad_norm": 1.5444122552871704, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8879779577255249, + "num_tokens": 418914652.0, + "step": 11490 + }, + { + "epoch": 2.1338904363974, + "grad_norm": 1.5758534669876099, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8677123785018921, + "num_tokens": 418956571.0, + "step": 11491 + }, + { + "epoch": 2.134076137418756, + "grad_norm": 1.6003286838531494, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.8997427225112915, + "num_tokens": 418992959.0, + "step": 11492 + }, + { + "epoch": 2.1342618384401115, + "grad_norm": 1.6624256372451782, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8795356750488281, + "num_tokens": 419029018.0, + "step": 11493 + }, + { + "epoch": 2.1344475394614673, + "grad_norm": 1.6434563398361206, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8856848478317261, + "num_tokens": 419064112.0, + "step": 11494 + }, + { + "epoch": 2.1346332404828225, + "grad_norm": 1.5785921812057495, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8902589082717896, + "num_tokens": 419099836.0, + "step": 11495 + }, + { + "epoch": 2.1348189415041783, + "grad_norm": 1.4769457578659058, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.8948348164558411, + "num_tokens": 419138910.0, + "step": 11496 + }, + { + "epoch": 2.135004642525534, + "grad_norm": 1.7763129472732544, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8897416591644287, + "num_tokens": 419172791.0, + "step": 11497 + }, + { + "epoch": 2.1351903435468893, + "grad_norm": 1.4572328329086304, + "learning_rate": 1e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.8989075422286987, + "num_tokens": 419208940.0, + "step": 11498 + }, + { + "epoch": 2.135376044568245, + "grad_norm": 1.6436734199523926, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8873989582061768, + "num_tokens": 419242955.0, + "step": 11499 + }, + { + "epoch": 2.1355617455896008, + "grad_norm": 1.4292417764663696, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8835318088531494, + "num_tokens": 419287732.0, + "step": 11500 + }, + { + "epoch": 2.1357474466109565, + "grad_norm": 1.8883129358291626, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8748149871826172, + "num_tokens": 419319207.0, + "step": 11501 + }, + { + "epoch": 2.135933147632312, + "grad_norm": 1.57831871509552, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8867263793945312, + "num_tokens": 419355204.0, + "step": 11502 + }, + { + "epoch": 2.1361188486536675, + "grad_norm": 1.7842010259628296, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8887904286384583, + "num_tokens": 419386179.0, + "step": 11503 + }, + { + "epoch": 2.1363045496750233, + "grad_norm": 1.6863555908203125, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8934489488601685, + "num_tokens": 419425594.0, + "step": 11504 + }, + { + "epoch": 2.136490250696379, + "grad_norm": 1.8797682523727417, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8818455338478088, + "num_tokens": 419454939.0, + "step": 11505 + }, + { + "epoch": 2.1366759517177343, + "grad_norm": 1.6658185720443726, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8738007545471191, + "num_tokens": 419491484.0, + "step": 11506 + }, + { + "epoch": 2.13686165273909, + "grad_norm": 1.4679861068725586, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8907731175422668, + "num_tokens": 419530977.0, + "step": 11507 + }, + { + "epoch": 2.1370473537604457, + "grad_norm": 1.655562400817871, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8929774761199951, + "num_tokens": 419562504.0, + "step": 11508 + }, + { + "epoch": 2.1372330547818015, + "grad_norm": 1.5531103610992432, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8902376294136047, + "num_tokens": 419601245.0, + "step": 11509 + }, + { + "epoch": 2.1374187558031568, + "grad_norm": 1.7033599615097046, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8763869404792786, + "num_tokens": 419637248.0, + "step": 11510 + }, + { + "epoch": 2.1376044568245125, + "grad_norm": 1.539110779762268, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.886040985584259, + "num_tokens": 419679292.0, + "step": 11511 + }, + { + "epoch": 2.1377901578458682, + "grad_norm": 1.5583258867263794, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8918768167495728, + "num_tokens": 419716649.0, + "step": 11512 + }, + { + "epoch": 2.137975858867224, + "grad_norm": 1.5483291149139404, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8845324516296387, + "num_tokens": 419754218.0, + "step": 11513 + }, + { + "epoch": 2.1381615598885793, + "grad_norm": 1.5553414821624756, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8891674280166626, + "num_tokens": 419790823.0, + "step": 11514 + }, + { + "epoch": 2.138347260909935, + "grad_norm": 1.536144495010376, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8848758935928345, + "num_tokens": 419830249.0, + "step": 11515 + }, + { + "epoch": 2.1385329619312907, + "grad_norm": 1.5706852674484253, + "learning_rate": 1e-06, + "loss": 0.2715, + "mean_token_accuracy": 0.9025688171386719, + "num_tokens": 419863032.0, + "step": 11516 + }, + { + "epoch": 2.1387186629526465, + "grad_norm": 1.7175955772399902, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.882011353969574, + "num_tokens": 419900450.0, + "step": 11517 + }, + { + "epoch": 2.1389043639740017, + "grad_norm": 1.57194983959198, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8912469148635864, + "num_tokens": 419938313.0, + "step": 11518 + }, + { + "epoch": 2.1390900649953575, + "grad_norm": 1.5871621370315552, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.884509265422821, + "num_tokens": 419974246.0, + "step": 11519 + }, + { + "epoch": 2.139275766016713, + "grad_norm": 1.6386111974716187, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8909808993339539, + "num_tokens": 420004651.0, + "step": 11520 + }, + { + "epoch": 2.1394614670380685, + "grad_norm": 1.7426444292068481, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8909128904342651, + "num_tokens": 420042576.0, + "step": 11521 + }, + { + "epoch": 2.1396471680594242, + "grad_norm": 1.6504566669464111, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8950023651123047, + "num_tokens": 420076205.0, + "step": 11522 + }, + { + "epoch": 2.13983286908078, + "grad_norm": 1.5484182834625244, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.887900173664093, + "num_tokens": 420112176.0, + "step": 11523 + }, + { + "epoch": 2.1400185701021357, + "grad_norm": 1.636997938156128, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8830994963645935, + "num_tokens": 420145723.0, + "step": 11524 + }, + { + "epoch": 2.140204271123491, + "grad_norm": 1.5682333707809448, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8934851884841919, + "num_tokens": 420182059.0, + "step": 11525 + }, + { + "epoch": 2.1403899721448467, + "grad_norm": 1.5483489036560059, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8776987195014954, + "num_tokens": 420222092.0, + "step": 11526 + }, + { + "epoch": 2.1405756731662025, + "grad_norm": 1.6224371194839478, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8780897855758667, + "num_tokens": 420261429.0, + "step": 11527 + }, + { + "epoch": 2.140761374187558, + "grad_norm": 1.6587703227996826, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8917597532272339, + "num_tokens": 420294470.0, + "step": 11528 + }, + { + "epoch": 2.1409470752089135, + "grad_norm": 1.69428551197052, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8953832387924194, + "num_tokens": 420325829.0, + "step": 11529 + }, + { + "epoch": 2.141132776230269, + "grad_norm": 1.5090233087539673, + "learning_rate": 1e-06, + "loss": 0.2798, + "mean_token_accuracy": 0.8982970714569092, + "num_tokens": 420364535.0, + "step": 11530 + }, + { + "epoch": 2.141318477251625, + "grad_norm": 1.4705874919891357, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8914452195167542, + "num_tokens": 420402782.0, + "step": 11531 + }, + { + "epoch": 2.1415041782729807, + "grad_norm": 1.5096428394317627, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8909119963645935, + "num_tokens": 420442562.0, + "step": 11532 + }, + { + "epoch": 2.141689879294336, + "grad_norm": 1.572630763053894, + "learning_rate": 1e-06, + "loss": 0.2663, + "mean_token_accuracy": 0.9025446176528931, + "num_tokens": 420476856.0, + "step": 11533 + }, + { + "epoch": 2.1418755803156917, + "grad_norm": 1.6994388103485107, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8859765529632568, + "num_tokens": 420511509.0, + "step": 11534 + }, + { + "epoch": 2.1420612813370474, + "grad_norm": 1.4393032789230347, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.894966721534729, + "num_tokens": 420553414.0, + "step": 11535 + }, + { + "epoch": 2.142246982358403, + "grad_norm": 1.7273319959640503, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8880220651626587, + "num_tokens": 420586852.0, + "step": 11536 + }, + { + "epoch": 2.1424326833797585, + "grad_norm": 1.7240346670150757, + "learning_rate": 1e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.8986930251121521, + "num_tokens": 420621843.0, + "step": 11537 + }, + { + "epoch": 2.142618384401114, + "grad_norm": 1.5880944728851318, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8842698931694031, + "num_tokens": 420660619.0, + "step": 11538 + }, + { + "epoch": 2.14280408542247, + "grad_norm": 1.7690476179122925, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8696523904800415, + "num_tokens": 420694405.0, + "step": 11539 + }, + { + "epoch": 2.1429897864438257, + "grad_norm": 1.5590167045593262, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8833513259887695, + "num_tokens": 420731688.0, + "step": 11540 + }, + { + "epoch": 2.143175487465181, + "grad_norm": 1.5680603981018066, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8849059343338013, + "num_tokens": 420768431.0, + "step": 11541 + }, + { + "epoch": 2.1433611884865367, + "grad_norm": 1.6336268186569214, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8962135314941406, + "num_tokens": 420803944.0, + "step": 11542 + }, + { + "epoch": 2.1435468895078924, + "grad_norm": 1.73829984664917, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8915418386459351, + "num_tokens": 420833585.0, + "step": 11543 + }, + { + "epoch": 2.1437325905292477, + "grad_norm": 1.6059094667434692, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8938173055648804, + "num_tokens": 420868275.0, + "step": 11544 + }, + { + "epoch": 2.1439182915506034, + "grad_norm": 1.6935789585113525, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8807403445243835, + "num_tokens": 420904558.0, + "step": 11545 + }, + { + "epoch": 2.144103992571959, + "grad_norm": 1.5278980731964111, + "learning_rate": 1e-06, + "loss": 0.2825, + "mean_token_accuracy": 0.8959860801696777, + "num_tokens": 420940487.0, + "step": 11546 + }, + { + "epoch": 2.144289693593315, + "grad_norm": 1.364801049232483, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.893044114112854, + "num_tokens": 420988686.0, + "step": 11547 + }, + { + "epoch": 2.14447539461467, + "grad_norm": 1.4302443265914917, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8908478021621704, + "num_tokens": 421029316.0, + "step": 11548 + }, + { + "epoch": 2.144661095636026, + "grad_norm": 1.6093523502349854, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8847472667694092, + "num_tokens": 421068192.0, + "step": 11549 + }, + { + "epoch": 2.1448467966573816, + "grad_norm": 1.5268278121948242, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.9003639221191406, + "num_tokens": 421103778.0, + "step": 11550 + }, + { + "epoch": 2.1450324976787374, + "grad_norm": 1.513283133506775, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.8999278545379639, + "num_tokens": 421139517.0, + "step": 11551 + }, + { + "epoch": 2.1452181987000927, + "grad_norm": 1.4566705226898193, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8796778917312622, + "num_tokens": 421185665.0, + "step": 11552 + }, + { + "epoch": 2.1454038997214484, + "grad_norm": 1.7142060995101929, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8814470767974854, + "num_tokens": 421218648.0, + "step": 11553 + }, + { + "epoch": 2.145589600742804, + "grad_norm": 1.717973232269287, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8809338212013245, + "num_tokens": 421253494.0, + "step": 11554 + }, + { + "epoch": 2.14577530176416, + "grad_norm": 1.4194090366363525, + "learning_rate": 1e-06, + "loss": 0.2853, + "mean_token_accuracy": 0.8952284455299377, + "num_tokens": 421294965.0, + "step": 11555 + }, + { + "epoch": 2.145961002785515, + "grad_norm": 1.6824203729629517, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8904356360435486, + "num_tokens": 421326864.0, + "step": 11556 + }, + { + "epoch": 2.146146703806871, + "grad_norm": 1.5868171453475952, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8898435831069946, + "num_tokens": 421366486.0, + "step": 11557 + }, + { + "epoch": 2.1463324048282266, + "grad_norm": 1.5698888301849365, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.886621356010437, + "num_tokens": 421406009.0, + "step": 11558 + }, + { + "epoch": 2.1465181058495824, + "grad_norm": 1.7863855361938477, + "learning_rate": 1e-06, + "loss": 0.2728, + "mean_token_accuracy": 0.9001466035842896, + "num_tokens": 421433882.0, + "step": 11559 + }, + { + "epoch": 2.1467038068709376, + "grad_norm": 1.613714337348938, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.875381350517273, + "num_tokens": 421473869.0, + "step": 11560 + }, + { + "epoch": 2.1468895078922934, + "grad_norm": 1.6365735530853271, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8946658372879028, + "num_tokens": 421508178.0, + "step": 11561 + }, + { + "epoch": 2.147075208913649, + "grad_norm": 1.635088324546814, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8815109133720398, + "num_tokens": 421545888.0, + "step": 11562 + }, + { + "epoch": 2.147260909935005, + "grad_norm": 1.7407422065734863, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8878506422042847, + "num_tokens": 421578759.0, + "step": 11563 + }, + { + "epoch": 2.14744661095636, + "grad_norm": 1.5677639245986938, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8829442262649536, + "num_tokens": 421615617.0, + "step": 11564 + }, + { + "epoch": 2.147632311977716, + "grad_norm": 1.7083284854888916, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8837503790855408, + "num_tokens": 421650155.0, + "step": 11565 + }, + { + "epoch": 2.1478180129990716, + "grad_norm": 1.761755347251892, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8738884925842285, + "num_tokens": 421684196.0, + "step": 11566 + }, + { + "epoch": 2.148003714020427, + "grad_norm": 1.5514562129974365, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8914476633071899, + "num_tokens": 421722468.0, + "step": 11567 + }, + { + "epoch": 2.1481894150417826, + "grad_norm": 1.5174384117126465, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8834068179130554, + "num_tokens": 421761030.0, + "step": 11568 + }, + { + "epoch": 2.1483751160631384, + "grad_norm": 1.7307440042495728, + "learning_rate": 1e-06, + "loss": 0.2632, + "mean_token_accuracy": 0.9016695022583008, + "num_tokens": 421790987.0, + "step": 11569 + }, + { + "epoch": 2.148560817084494, + "grad_norm": 1.647719144821167, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8843713998794556, + "num_tokens": 421825384.0, + "step": 11570 + }, + { + "epoch": 2.1487465181058494, + "grad_norm": 1.5268232822418213, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.8917677402496338, + "num_tokens": 421862741.0, + "step": 11571 + }, + { + "epoch": 2.148932219127205, + "grad_norm": 1.5434709787368774, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8861633539199829, + "num_tokens": 421902076.0, + "step": 11572 + }, + { + "epoch": 2.149117920148561, + "grad_norm": 1.6602238416671753, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8838889002799988, + "num_tokens": 421935866.0, + "step": 11573 + }, + { + "epoch": 2.1493036211699166, + "grad_norm": 1.7331775426864624, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8821758031845093, + "num_tokens": 421969518.0, + "step": 11574 + }, + { + "epoch": 2.149489322191272, + "grad_norm": 1.6353142261505127, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8796954154968262, + "num_tokens": 422006345.0, + "step": 11575 + }, + { + "epoch": 2.1496750232126276, + "grad_norm": 1.590549349784851, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8859281539916992, + "num_tokens": 422040935.0, + "step": 11576 + }, + { + "epoch": 2.1498607242339833, + "grad_norm": 1.5483977794647217, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8900970220565796, + "num_tokens": 422080055.0, + "step": 11577 + }, + { + "epoch": 2.150046425255339, + "grad_norm": 1.4941556453704834, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8923494815826416, + "num_tokens": 422119420.0, + "step": 11578 + }, + { + "epoch": 2.1502321262766944, + "grad_norm": 1.8012021780014038, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8708581924438477, + "num_tokens": 422151623.0, + "step": 11579 + }, + { + "epoch": 2.15041782729805, + "grad_norm": 1.5956060886383057, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8842533826828003, + "num_tokens": 422190108.0, + "step": 11580 + }, + { + "epoch": 2.150603528319406, + "grad_norm": 1.4349161386489868, + "learning_rate": 1e-06, + "loss": 0.2708, + "mean_token_accuracy": 0.9019489884376526, + "num_tokens": 422227711.0, + "step": 11581 + }, + { + "epoch": 2.1507892293407616, + "grad_norm": 1.5637458562850952, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8847188949584961, + "num_tokens": 422265358.0, + "step": 11582 + }, + { + "epoch": 2.150974930362117, + "grad_norm": 1.641236662864685, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8866305947303772, + "num_tokens": 422302988.0, + "step": 11583 + }, + { + "epoch": 2.1511606313834726, + "grad_norm": 1.5645473003387451, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8869972825050354, + "num_tokens": 422340379.0, + "step": 11584 + }, + { + "epoch": 2.1513463324048283, + "grad_norm": 1.6568241119384766, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8875226378440857, + "num_tokens": 422372600.0, + "step": 11585 + }, + { + "epoch": 2.151532033426184, + "grad_norm": 1.522089958190918, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8874223828315735, + "num_tokens": 422410013.0, + "step": 11586 + }, + { + "epoch": 2.1517177344475393, + "grad_norm": 1.7699147462844849, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8776059746742249, + "num_tokens": 422437689.0, + "step": 11587 + }, + { + "epoch": 2.151903435468895, + "grad_norm": 1.5961816310882568, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.884169340133667, + "num_tokens": 422476110.0, + "step": 11588 + }, + { + "epoch": 2.152089136490251, + "grad_norm": 1.5824238061904907, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8907410502433777, + "num_tokens": 422513576.0, + "step": 11589 + }, + { + "epoch": 2.152274837511606, + "grad_norm": 1.6456055641174316, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8797714710235596, + "num_tokens": 422549347.0, + "step": 11590 + }, + { + "epoch": 2.152460538532962, + "grad_norm": 1.6488832235336304, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8829119205474854, + "num_tokens": 422585219.0, + "step": 11591 + }, + { + "epoch": 2.1526462395543176, + "grad_norm": 1.5032132863998413, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.888746976852417, + "num_tokens": 422625128.0, + "step": 11592 + }, + { + "epoch": 2.1528319405756733, + "grad_norm": 1.609438419342041, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8909575343132019, + "num_tokens": 422660574.0, + "step": 11593 + }, + { + "epoch": 2.153017641597029, + "grad_norm": 1.6417721509933472, + "learning_rate": 1e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.8956900835037231, + "num_tokens": 422693322.0, + "step": 11594 + }, + { + "epoch": 2.1532033426183843, + "grad_norm": 1.5332754850387573, + "learning_rate": 1e-06, + "loss": 0.268, + "mean_token_accuracy": 0.8990802764892578, + "num_tokens": 422727312.0, + "step": 11595 + }, + { + "epoch": 2.15338904363974, + "grad_norm": 1.605866551399231, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.872657299041748, + "num_tokens": 422766654.0, + "step": 11596 + }, + { + "epoch": 2.1535747446610958, + "grad_norm": 1.6496317386627197, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8940702676773071, + "num_tokens": 422800309.0, + "step": 11597 + }, + { + "epoch": 2.153760445682451, + "grad_norm": 1.7381923198699951, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8920983672142029, + "num_tokens": 422829923.0, + "step": 11598 + }, + { + "epoch": 2.153946146703807, + "grad_norm": 1.540558099746704, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8921014666557312, + "num_tokens": 422866643.0, + "step": 11599 + }, + { + "epoch": 2.1541318477251625, + "grad_norm": 1.8056704998016357, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8701127767562866, + "num_tokens": 422897662.0, + "step": 11600 + }, + { + "epoch": 2.1543175487465183, + "grad_norm": 1.664410948753357, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8757058382034302, + "num_tokens": 422937119.0, + "step": 11601 + }, + { + "epoch": 2.1545032497678736, + "grad_norm": 1.6540213823318481, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.893765389919281, + "num_tokens": 422969424.0, + "step": 11602 + }, + { + "epoch": 2.1546889507892293, + "grad_norm": 1.6188615560531616, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8670395016670227, + "num_tokens": 423009844.0, + "step": 11603 + }, + { + "epoch": 2.154874651810585, + "grad_norm": 1.593781590461731, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8768325448036194, + "num_tokens": 423049726.0, + "step": 11604 + }, + { + "epoch": 2.1550603528319408, + "grad_norm": 1.5687761306762695, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8943105340003967, + "num_tokens": 423080464.0, + "step": 11605 + }, + { + "epoch": 2.155246053853296, + "grad_norm": 1.5758857727050781, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8959604501724243, + "num_tokens": 423116163.0, + "step": 11606 + }, + { + "epoch": 2.1554317548746518, + "grad_norm": 1.7867306470870972, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8851838111877441, + "num_tokens": 423144016.0, + "step": 11607 + }, + { + "epoch": 2.1556174558960075, + "grad_norm": 1.5411049127578735, + "learning_rate": 1e-06, + "loss": 0.2663, + "mean_token_accuracy": 0.9037556648254395, + "num_tokens": 423181620.0, + "step": 11608 + }, + { + "epoch": 2.1558031569173632, + "grad_norm": 1.7000278234481812, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8780957460403442, + "num_tokens": 423215752.0, + "step": 11609 + }, + { + "epoch": 2.1559888579387185, + "grad_norm": 1.5562611818313599, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8806982636451721, + "num_tokens": 423254484.0, + "step": 11610 + }, + { + "epoch": 2.1561745589600743, + "grad_norm": 1.5286706686019897, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8914222121238708, + "num_tokens": 423293625.0, + "step": 11611 + }, + { + "epoch": 2.15636025998143, + "grad_norm": 1.488459587097168, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8874447345733643, + "num_tokens": 423337445.0, + "step": 11612 + }, + { + "epoch": 2.1565459610027853, + "grad_norm": 1.6098865270614624, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8965806365013123, + "num_tokens": 423372551.0, + "step": 11613 + }, + { + "epoch": 2.156731662024141, + "grad_norm": 1.6802964210510254, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8794957399368286, + "num_tokens": 423406845.0, + "step": 11614 + }, + { + "epoch": 2.1569173630454967, + "grad_norm": 1.5048253536224365, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8659195899963379, + "num_tokens": 423450960.0, + "step": 11615 + }, + { + "epoch": 2.1571030640668525, + "grad_norm": 1.6825664043426514, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.8955514430999756, + "num_tokens": 423486343.0, + "step": 11616 + }, + { + "epoch": 2.157288765088208, + "grad_norm": 1.6196750402450562, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8884821534156799, + "num_tokens": 423524011.0, + "step": 11617 + }, + { + "epoch": 2.1574744661095635, + "grad_norm": 1.7338237762451172, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8829435110092163, + "num_tokens": 423558035.0, + "step": 11618 + }, + { + "epoch": 2.1576601671309192, + "grad_norm": 1.6525230407714844, + "learning_rate": 1e-06, + "loss": 0.2762, + "mean_token_accuracy": 0.901471734046936, + "num_tokens": 423590995.0, + "step": 11619 + }, + { + "epoch": 2.157845868152275, + "grad_norm": 1.5397615432739258, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8871777057647705, + "num_tokens": 423635661.0, + "step": 11620 + }, + { + "epoch": 2.1580315691736303, + "grad_norm": 1.4645106792449951, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8805472254753113, + "num_tokens": 423680528.0, + "step": 11621 + }, + { + "epoch": 2.158217270194986, + "grad_norm": 1.7401251792907715, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8794444799423218, + "num_tokens": 423712028.0, + "step": 11622 + }, + { + "epoch": 2.1584029712163417, + "grad_norm": 1.615801215171814, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8850472569465637, + "num_tokens": 423750358.0, + "step": 11623 + }, + { + "epoch": 2.1585886722376975, + "grad_norm": 1.54196035861969, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8842474818229675, + "num_tokens": 423791455.0, + "step": 11624 + }, + { + "epoch": 2.1587743732590527, + "grad_norm": 1.6670695543289185, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8868524432182312, + "num_tokens": 423826180.0, + "step": 11625 + }, + { + "epoch": 2.1589600742804085, + "grad_norm": 1.5480427742004395, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8932383060455322, + "num_tokens": 423863086.0, + "step": 11626 + }, + { + "epoch": 2.159145775301764, + "grad_norm": 1.7758129835128784, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.86944180727005, + "num_tokens": 423895274.0, + "step": 11627 + }, + { + "epoch": 2.15933147632312, + "grad_norm": 1.5842578411102295, + "learning_rate": 1e-06, + "loss": 0.2744, + "mean_token_accuracy": 0.8999923467636108, + "num_tokens": 423926070.0, + "step": 11628 + }, + { + "epoch": 2.1595171773444752, + "grad_norm": 1.5211102962493896, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8943010568618774, + "num_tokens": 423962025.0, + "step": 11629 + }, + { + "epoch": 2.159702878365831, + "grad_norm": 1.711982011795044, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8821526765823364, + "num_tokens": 423995741.0, + "step": 11630 + }, + { + "epoch": 2.1598885793871867, + "grad_norm": 1.6747965812683105, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8700311183929443, + "num_tokens": 424031289.0, + "step": 11631 + }, + { + "epoch": 2.1600742804085424, + "grad_norm": 1.5772465467453003, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8854948878288269, + "num_tokens": 424072425.0, + "step": 11632 + }, + { + "epoch": 2.1602599814298977, + "grad_norm": 1.6464669704437256, + "learning_rate": 1e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.8985206484794617, + "num_tokens": 424105831.0, + "step": 11633 + }, + { + "epoch": 2.1604456824512535, + "grad_norm": 1.5422697067260742, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8932091593742371, + "num_tokens": 424144744.0, + "step": 11634 + }, + { + "epoch": 2.160631383472609, + "grad_norm": 1.744969129562378, + "learning_rate": 1e-06, + "loss": 0.2815, + "mean_token_accuracy": 0.8983458876609802, + "num_tokens": 424179076.0, + "step": 11635 + }, + { + "epoch": 2.160817084493965, + "grad_norm": 1.742876648902893, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8875204920768738, + "num_tokens": 424208351.0, + "step": 11636 + }, + { + "epoch": 2.16100278551532, + "grad_norm": 1.5707052946090698, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8912796974182129, + "num_tokens": 424244369.0, + "step": 11637 + }, + { + "epoch": 2.161188486536676, + "grad_norm": 1.6558395624160767, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8816357851028442, + "num_tokens": 424280906.0, + "step": 11638 + }, + { + "epoch": 2.1613741875580317, + "grad_norm": 1.6946841478347778, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8824102878570557, + "num_tokens": 424313175.0, + "step": 11639 + }, + { + "epoch": 2.1615598885793874, + "grad_norm": 1.7381097078323364, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8681949377059937, + "num_tokens": 424349108.0, + "step": 11640 + }, + { + "epoch": 2.1617455896007427, + "grad_norm": 1.617057204246521, + "learning_rate": 1e-06, + "loss": 0.2612, + "mean_token_accuracy": 0.9021020531654358, + "num_tokens": 424378908.0, + "step": 11641 + }, + { + "epoch": 2.1619312906220984, + "grad_norm": 1.4987742900848389, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8931882381439209, + "num_tokens": 424420224.0, + "step": 11642 + }, + { + "epoch": 2.162116991643454, + "grad_norm": 1.7192274332046509, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8874367475509644, + "num_tokens": 424454408.0, + "step": 11643 + }, + { + "epoch": 2.1623026926648095, + "grad_norm": 1.5981861352920532, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.9004209041595459, + "num_tokens": 424489091.0, + "step": 11644 + }, + { + "epoch": 2.162488393686165, + "grad_norm": 1.6802308559417725, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8852317929267883, + "num_tokens": 424520675.0, + "step": 11645 + }, + { + "epoch": 2.162674094707521, + "grad_norm": 1.6655082702636719, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8868846893310547, + "num_tokens": 424554889.0, + "step": 11646 + }, + { + "epoch": 2.1628597957288767, + "grad_norm": 1.749413013458252, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8792867660522461, + "num_tokens": 424587693.0, + "step": 11647 + }, + { + "epoch": 2.163045496750232, + "grad_norm": 1.749191403388977, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8788552284240723, + "num_tokens": 424622847.0, + "step": 11648 + }, + { + "epoch": 2.1632311977715877, + "grad_norm": 1.552104115486145, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.8946311473846436, + "num_tokens": 424660888.0, + "step": 11649 + }, + { + "epoch": 2.1634168987929434, + "grad_norm": 1.7345882654190063, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8779956102371216, + "num_tokens": 424698782.0, + "step": 11650 + }, + { + "epoch": 2.163602599814299, + "grad_norm": 1.6061522960662842, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8813693523406982, + "num_tokens": 424737013.0, + "step": 11651 + }, + { + "epoch": 2.1637883008356544, + "grad_norm": 1.6352952718734741, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8832225799560547, + "num_tokens": 424774294.0, + "step": 11652 + }, + { + "epoch": 2.16397400185701, + "grad_norm": 1.4899500608444214, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8965682983398438, + "num_tokens": 424810893.0, + "step": 11653 + }, + { + "epoch": 2.164159702878366, + "grad_norm": 1.5265554189682007, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8781245946884155, + "num_tokens": 424851736.0, + "step": 11654 + }, + { + "epoch": 2.1643454038997216, + "grad_norm": 1.6286613941192627, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8873728513717651, + "num_tokens": 424889603.0, + "step": 11655 + }, + { + "epoch": 2.164531104921077, + "grad_norm": 1.4990166425704956, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8902961015701294, + "num_tokens": 424932536.0, + "step": 11656 + }, + { + "epoch": 2.1647168059424327, + "grad_norm": 1.7878658771514893, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8928547501564026, + "num_tokens": 424965104.0, + "step": 11657 + }, + { + "epoch": 2.1649025069637884, + "grad_norm": 1.4333488941192627, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8824357390403748, + "num_tokens": 425008587.0, + "step": 11658 + }, + { + "epoch": 2.165088207985144, + "grad_norm": 1.6362884044647217, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.8978146314620972, + "num_tokens": 425043784.0, + "step": 11659 + }, + { + "epoch": 2.1652739090064994, + "grad_norm": 1.4551149606704712, + "learning_rate": 1e-06, + "loss": 0.2694, + "mean_token_accuracy": 0.9036165475845337, + "num_tokens": 425082145.0, + "step": 11660 + }, + { + "epoch": 2.165459610027855, + "grad_norm": 1.6318635940551758, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8869409561157227, + "num_tokens": 425119785.0, + "step": 11661 + }, + { + "epoch": 2.165645311049211, + "grad_norm": 1.682985782623291, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8791955709457397, + "num_tokens": 425157919.0, + "step": 11662 + }, + { + "epoch": 2.1658310120705666, + "grad_norm": 1.4494056701660156, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8942131400108337, + "num_tokens": 425196933.0, + "step": 11663 + }, + { + "epoch": 2.166016713091922, + "grad_norm": 1.4449660778045654, + "learning_rate": 1e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.8949636220932007, + "num_tokens": 425237479.0, + "step": 11664 + }, + { + "epoch": 2.1662024141132776, + "grad_norm": 1.8838297128677368, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8890553116798401, + "num_tokens": 425266948.0, + "step": 11665 + }, + { + "epoch": 2.1663881151346334, + "grad_norm": 1.703684687614441, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8818460702896118, + "num_tokens": 425303070.0, + "step": 11666 + }, + { + "epoch": 2.1665738161559887, + "grad_norm": 1.714500904083252, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8815773129463196, + "num_tokens": 425334948.0, + "step": 11667 + }, + { + "epoch": 2.1667595171773444, + "grad_norm": 1.779935359954834, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8806208372116089, + "num_tokens": 425368862.0, + "step": 11668 + }, + { + "epoch": 2.1669452181987, + "grad_norm": 1.8739304542541504, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8973034620285034, + "num_tokens": 425400276.0, + "step": 11669 + }, + { + "epoch": 2.167130919220056, + "grad_norm": 1.5970544815063477, + "learning_rate": 1e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.8989113569259644, + "num_tokens": 425431726.0, + "step": 11670 + }, + { + "epoch": 2.167316620241411, + "grad_norm": 1.6938613653182983, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8866052031517029, + "num_tokens": 425462828.0, + "step": 11671 + }, + { + "epoch": 2.167502321262767, + "grad_norm": 1.5688914060592651, + "learning_rate": 1e-06, + "loss": 0.259, + "mean_token_accuracy": 0.9063381552696228, + "num_tokens": 425497919.0, + "step": 11672 + }, + { + "epoch": 2.1676880222841226, + "grad_norm": 1.4775354862213135, + "learning_rate": 1e-06, + "loss": 0.277, + "mean_token_accuracy": 0.8977563977241516, + "num_tokens": 425538494.0, + "step": 11673 + }, + { + "epoch": 2.1678737233054783, + "grad_norm": 1.7879525423049927, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8944914937019348, + "num_tokens": 425574527.0, + "step": 11674 + }, + { + "epoch": 2.1680594243268336, + "grad_norm": 1.5279172658920288, + "learning_rate": 1e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.8994872570037842, + "num_tokens": 425611654.0, + "step": 11675 + }, + { + "epoch": 2.1682451253481894, + "grad_norm": 1.8691325187683105, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8727214336395264, + "num_tokens": 425641923.0, + "step": 11676 + }, + { + "epoch": 2.168430826369545, + "grad_norm": 1.842337965965271, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8712484836578369, + "num_tokens": 425676406.0, + "step": 11677 + }, + { + "epoch": 2.168616527390901, + "grad_norm": 1.5412973165512085, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8796687126159668, + "num_tokens": 425718816.0, + "step": 11678 + }, + { + "epoch": 2.168802228412256, + "grad_norm": 1.72353196144104, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8916699886322021, + "num_tokens": 425751555.0, + "step": 11679 + }, + { + "epoch": 2.168987929433612, + "grad_norm": 1.6666425466537476, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8739105463027954, + "num_tokens": 425791498.0, + "step": 11680 + }, + { + "epoch": 2.1691736304549676, + "grad_norm": 1.59476900100708, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8817763328552246, + "num_tokens": 425831494.0, + "step": 11681 + }, + { + "epoch": 2.1693593314763233, + "grad_norm": 1.6193214654922485, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8843878507614136, + "num_tokens": 425868243.0, + "step": 11682 + }, + { + "epoch": 2.1695450324976786, + "grad_norm": 1.7016723155975342, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8905380964279175, + "num_tokens": 425901967.0, + "step": 11683 + }, + { + "epoch": 2.1697307335190343, + "grad_norm": 1.717126727104187, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8885468244552612, + "num_tokens": 425934921.0, + "step": 11684 + }, + { + "epoch": 2.16991643454039, + "grad_norm": 1.43375825881958, + "learning_rate": 1e-06, + "loss": 0.2666, + "mean_token_accuracy": 0.9033254384994507, + "num_tokens": 425975910.0, + "step": 11685 + }, + { + "epoch": 2.170102135561746, + "grad_norm": 1.6402844190597534, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8927432298660278, + "num_tokens": 426012446.0, + "step": 11686 + }, + { + "epoch": 2.170287836583101, + "grad_norm": 1.7341417074203491, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8949915170669556, + "num_tokens": 426040186.0, + "step": 11687 + }, + { + "epoch": 2.170473537604457, + "grad_norm": 1.778031587600708, + "learning_rate": 1e-06, + "loss": 0.2625, + "mean_token_accuracy": 0.9056894779205322, + "num_tokens": 426067563.0, + "step": 11688 + }, + { + "epoch": 2.1706592386258126, + "grad_norm": 1.7162214517593384, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.9006345272064209, + "num_tokens": 426100620.0, + "step": 11689 + }, + { + "epoch": 2.170844939647168, + "grad_norm": 1.7369790077209473, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8838307857513428, + "num_tokens": 426134398.0, + "step": 11690 + }, + { + "epoch": 2.1710306406685236, + "grad_norm": 1.6439310312271118, + "learning_rate": 1e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.8937044143676758, + "num_tokens": 426171972.0, + "step": 11691 + }, + { + "epoch": 2.1712163416898793, + "grad_norm": 1.8146982192993164, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8864749670028687, + "num_tokens": 426204487.0, + "step": 11692 + }, + { + "epoch": 2.171402042711235, + "grad_norm": 1.6455363035202026, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8805012702941895, + "num_tokens": 426242872.0, + "step": 11693 + }, + { + "epoch": 2.1715877437325903, + "grad_norm": 1.6687582731246948, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8956491947174072, + "num_tokens": 426275036.0, + "step": 11694 + }, + { + "epoch": 2.171773444753946, + "grad_norm": 1.6012595891952515, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8937138319015503, + "num_tokens": 426310893.0, + "step": 11695 + }, + { + "epoch": 2.171959145775302, + "grad_norm": 1.577722430229187, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8912728428840637, + "num_tokens": 426345004.0, + "step": 11696 + }, + { + "epoch": 2.1721448467966575, + "grad_norm": 1.660114049911499, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8869479298591614, + "num_tokens": 426378749.0, + "step": 11697 + }, + { + "epoch": 2.172330547818013, + "grad_norm": 1.672708511352539, + "learning_rate": 1e-06, + "loss": 0.2684, + "mean_token_accuracy": 0.9026724100112915, + "num_tokens": 426408712.0, + "step": 11698 + }, + { + "epoch": 2.1725162488393686, + "grad_norm": 1.6908608675003052, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8821679353713989, + "num_tokens": 426450113.0, + "step": 11699 + }, + { + "epoch": 2.1727019498607243, + "grad_norm": 1.8098762035369873, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.8981646299362183, + "num_tokens": 426478398.0, + "step": 11700 + }, + { + "epoch": 2.17288765088208, + "grad_norm": 1.5032203197479248, + "learning_rate": 1e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.8997943997383118, + "num_tokens": 426515767.0, + "step": 11701 + }, + { + "epoch": 2.1730733519034353, + "grad_norm": 1.6242895126342773, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8888896107673645, + "num_tokens": 426552622.0, + "step": 11702 + }, + { + "epoch": 2.173259052924791, + "grad_norm": 1.5094316005706787, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.885402262210846, + "num_tokens": 426589558.0, + "step": 11703 + }, + { + "epoch": 2.173444753946147, + "grad_norm": 1.5427626371383667, + "learning_rate": 1e-06, + "loss": 0.276, + "mean_token_accuracy": 0.8994195461273193, + "num_tokens": 426625920.0, + "step": 11704 + }, + { + "epoch": 2.1736304549675025, + "grad_norm": 1.7535851001739502, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8827057480812073, + "num_tokens": 426662299.0, + "step": 11705 + }, + { + "epoch": 2.173816155988858, + "grad_norm": 1.4675804376602173, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8905301690101624, + "num_tokens": 426702796.0, + "step": 11706 + }, + { + "epoch": 2.1740018570102135, + "grad_norm": 1.5209605693817139, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8813165426254272, + "num_tokens": 426747866.0, + "step": 11707 + }, + { + "epoch": 2.1741875580315693, + "grad_norm": 1.675304889678955, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.889885663986206, + "num_tokens": 426783270.0, + "step": 11708 + }, + { + "epoch": 2.174373259052925, + "grad_norm": 1.7546712160110474, + "learning_rate": 1e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.9005938172340393, + "num_tokens": 426811504.0, + "step": 11709 + }, + { + "epoch": 2.1745589600742803, + "grad_norm": 1.8130624294281006, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.875577986240387, + "num_tokens": 426852238.0, + "step": 11710 + }, + { + "epoch": 2.174744661095636, + "grad_norm": 1.6514683961868286, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8930282592773438, + "num_tokens": 426886881.0, + "step": 11711 + }, + { + "epoch": 2.1749303621169918, + "grad_norm": 1.7328225374221802, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8896510004997253, + "num_tokens": 426918303.0, + "step": 11712 + }, + { + "epoch": 2.175116063138347, + "grad_norm": 1.5142731666564941, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.8947310447692871, + "num_tokens": 426954863.0, + "step": 11713 + }, + { + "epoch": 2.1753017641597028, + "grad_norm": 1.8427982330322266, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8778423070907593, + "num_tokens": 426988652.0, + "step": 11714 + }, + { + "epoch": 2.1754874651810585, + "grad_norm": 1.6527262926101685, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8781801462173462, + "num_tokens": 427023034.0, + "step": 11715 + }, + { + "epoch": 2.1756731662024142, + "grad_norm": 1.7824910879135132, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8759378790855408, + "num_tokens": 427056823.0, + "step": 11716 + }, + { + "epoch": 2.1758588672237695, + "grad_norm": 1.6162116527557373, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8944150805473328, + "num_tokens": 427089026.0, + "step": 11717 + }, + { + "epoch": 2.1760445682451253, + "grad_norm": 1.6470668315887451, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8931489586830139, + "num_tokens": 427120966.0, + "step": 11718 + }, + { + "epoch": 2.176230269266481, + "grad_norm": 1.7881149053573608, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8746484518051147, + "num_tokens": 427153166.0, + "step": 11719 + }, + { + "epoch": 2.1764159702878367, + "grad_norm": 1.609789490699768, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8829513788223267, + "num_tokens": 427192031.0, + "step": 11720 + }, + { + "epoch": 2.176601671309192, + "grad_norm": 1.6930296421051025, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8702969551086426, + "num_tokens": 427232213.0, + "step": 11721 + }, + { + "epoch": 2.1767873723305478, + "grad_norm": 1.6671503782272339, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8851615190505981, + "num_tokens": 427267009.0, + "step": 11722 + }, + { + "epoch": 2.1769730733519035, + "grad_norm": 1.6054304838180542, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8866472244262695, + "num_tokens": 427305169.0, + "step": 11723 + }, + { + "epoch": 2.177158774373259, + "grad_norm": 1.7128517627716064, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8654429912567139, + "num_tokens": 427343397.0, + "step": 11724 + }, + { + "epoch": 2.1773444753946145, + "grad_norm": 1.5337271690368652, + "learning_rate": 1e-06, + "loss": 0.2795, + "mean_token_accuracy": 0.8989295959472656, + "num_tokens": 427377953.0, + "step": 11725 + }, + { + "epoch": 2.1775301764159702, + "grad_norm": 1.6341320276260376, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.894964337348938, + "num_tokens": 427411792.0, + "step": 11726 + }, + { + "epoch": 2.177715877437326, + "grad_norm": 1.724169373512268, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8826813697814941, + "num_tokens": 427446041.0, + "step": 11727 + }, + { + "epoch": 2.1779015784586817, + "grad_norm": 1.50757896900177, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8965575695037842, + "num_tokens": 427483516.0, + "step": 11728 + }, + { + "epoch": 2.178087279480037, + "grad_norm": 1.604061245918274, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.888788104057312, + "num_tokens": 427517248.0, + "step": 11729 + }, + { + "epoch": 2.1782729805013927, + "grad_norm": 1.5170844793319702, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8917572498321533, + "num_tokens": 427555486.0, + "step": 11730 + }, + { + "epoch": 2.1784586815227485, + "grad_norm": 1.5987811088562012, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.88844895362854, + "num_tokens": 427593218.0, + "step": 11731 + }, + { + "epoch": 2.178644382544104, + "grad_norm": 1.662213921546936, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8892009854316711, + "num_tokens": 427627216.0, + "step": 11732 + }, + { + "epoch": 2.1788300835654595, + "grad_norm": 1.6368111371994019, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8815426826477051, + "num_tokens": 427665645.0, + "step": 11733 + }, + { + "epoch": 2.179015784586815, + "grad_norm": 1.473596215248108, + "learning_rate": 1e-06, + "loss": 0.2726, + "mean_token_accuracy": 0.9064884185791016, + "num_tokens": 427702347.0, + "step": 11734 + }, + { + "epoch": 2.179201485608171, + "grad_norm": 1.7301148176193237, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8934298753738403, + "num_tokens": 427736745.0, + "step": 11735 + }, + { + "epoch": 2.1793871866295262, + "grad_norm": 1.7462382316589355, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8891330361366272, + "num_tokens": 427770685.0, + "step": 11736 + }, + { + "epoch": 2.179572887650882, + "grad_norm": 1.5845240354537964, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.883055567741394, + "num_tokens": 427805729.0, + "step": 11737 + }, + { + "epoch": 2.1797585886722377, + "grad_norm": 1.5461057424545288, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8850754499435425, + "num_tokens": 427842809.0, + "step": 11738 + }, + { + "epoch": 2.1799442896935934, + "grad_norm": 1.5899417400360107, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8745945692062378, + "num_tokens": 427880718.0, + "step": 11739 + }, + { + "epoch": 2.1801299907149487, + "grad_norm": 1.5463950634002686, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8943333625793457, + "num_tokens": 427920585.0, + "step": 11740 + }, + { + "epoch": 2.1803156917363045, + "grad_norm": 1.5657193660736084, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8926247358322144, + "num_tokens": 427957625.0, + "step": 11741 + }, + { + "epoch": 2.18050139275766, + "grad_norm": 2.0505950450897217, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8844548463821411, + "num_tokens": 427990421.0, + "step": 11742 + }, + { + "epoch": 2.180687093779016, + "grad_norm": 1.5940316915512085, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8842825889587402, + "num_tokens": 428028178.0, + "step": 11743 + }, + { + "epoch": 2.180872794800371, + "grad_norm": 1.6715524196624756, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8819403648376465, + "num_tokens": 428065838.0, + "step": 11744 + }, + { + "epoch": 2.181058495821727, + "grad_norm": 1.5741596221923828, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8834694623947144, + "num_tokens": 428102762.0, + "step": 11745 + }, + { + "epoch": 2.1812441968430827, + "grad_norm": 1.5197573900222778, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.88232421875, + "num_tokens": 428142479.0, + "step": 11746 + }, + { + "epoch": 2.1814298978644384, + "grad_norm": 1.5127959251403809, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8952518105506897, + "num_tokens": 428181026.0, + "step": 11747 + }, + { + "epoch": 2.1816155988857937, + "grad_norm": 1.5889713764190674, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8793756365776062, + "num_tokens": 428219587.0, + "step": 11748 + }, + { + "epoch": 2.1818012999071494, + "grad_norm": 1.6602809429168701, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8936415910720825, + "num_tokens": 428253899.0, + "step": 11749 + }, + { + "epoch": 2.181987000928505, + "grad_norm": 1.6245003938674927, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8890557289123535, + "num_tokens": 428286868.0, + "step": 11750 + }, + { + "epoch": 2.182172701949861, + "grad_norm": 1.5163902044296265, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8922792673110962, + "num_tokens": 428325801.0, + "step": 11751 + }, + { + "epoch": 2.182358402971216, + "grad_norm": 1.4603610038757324, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8970401287078857, + "num_tokens": 428367472.0, + "step": 11752 + }, + { + "epoch": 2.182544103992572, + "grad_norm": 1.7387691736221313, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8855333924293518, + "num_tokens": 428401369.0, + "step": 11753 + }, + { + "epoch": 2.1827298050139277, + "grad_norm": 1.5489555597305298, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8821021318435669, + "num_tokens": 428441448.0, + "step": 11754 + }, + { + "epoch": 2.1829155060352834, + "grad_norm": 1.6445026397705078, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8884823322296143, + "num_tokens": 428476642.0, + "step": 11755 + }, + { + "epoch": 2.1831012070566387, + "grad_norm": 1.7331329584121704, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8758496046066284, + "num_tokens": 428511596.0, + "step": 11756 + }, + { + "epoch": 2.1832869080779944, + "grad_norm": 1.7517359256744385, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8845999836921692, + "num_tokens": 428545171.0, + "step": 11757 + }, + { + "epoch": 2.18347260909935, + "grad_norm": 1.6542624235153198, + "learning_rate": 1e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.8955754041671753, + "num_tokens": 428578796.0, + "step": 11758 + }, + { + "epoch": 2.1836583101207054, + "grad_norm": 1.6732587814331055, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8855034112930298, + "num_tokens": 428613375.0, + "step": 11759 + }, + { + "epoch": 2.183844011142061, + "grad_norm": 1.5315862894058228, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8914164304733276, + "num_tokens": 428654797.0, + "step": 11760 + }, + { + "epoch": 2.184029712163417, + "grad_norm": 1.4842231273651123, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.881377637386322, + "num_tokens": 428698357.0, + "step": 11761 + }, + { + "epoch": 2.1842154131847726, + "grad_norm": 1.6943163871765137, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8863683938980103, + "num_tokens": 428731586.0, + "step": 11762 + }, + { + "epoch": 2.1844011142061284, + "grad_norm": 1.6805484294891357, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8896511793136597, + "num_tokens": 428764772.0, + "step": 11763 + }, + { + "epoch": 2.1845868152274837, + "grad_norm": 1.5715216398239136, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.8970967531204224, + "num_tokens": 428799194.0, + "step": 11764 + }, + { + "epoch": 2.1847725162488394, + "grad_norm": 1.6157782077789307, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8765115737915039, + "num_tokens": 428839982.0, + "step": 11765 + }, + { + "epoch": 2.184958217270195, + "grad_norm": 1.5161713361740112, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.9000276327133179, + "num_tokens": 428876119.0, + "step": 11766 + }, + { + "epoch": 2.1851439182915504, + "grad_norm": 1.7150412797927856, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8867324590682983, + "num_tokens": 428907746.0, + "step": 11767 + }, + { + "epoch": 2.185329619312906, + "grad_norm": 1.5534123182296753, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.895954430103302, + "num_tokens": 428941711.0, + "step": 11768 + }, + { + "epoch": 2.185515320334262, + "grad_norm": 1.7787292003631592, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8878898024559021, + "num_tokens": 428969911.0, + "step": 11769 + }, + { + "epoch": 2.1857010213556176, + "grad_norm": 1.6413453817367554, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8841350674629211, + "num_tokens": 429006396.0, + "step": 11770 + }, + { + "epoch": 2.185886722376973, + "grad_norm": 1.5476336479187012, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8901940584182739, + "num_tokens": 429044147.0, + "step": 11771 + }, + { + "epoch": 2.1860724233983286, + "grad_norm": 1.7586333751678467, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8671818971633911, + "num_tokens": 429080314.0, + "step": 11772 + }, + { + "epoch": 2.1862581244196844, + "grad_norm": 1.6533353328704834, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8766347765922546, + "num_tokens": 429119138.0, + "step": 11773 + }, + { + "epoch": 2.18644382544104, + "grad_norm": 1.6388005018234253, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8837622404098511, + "num_tokens": 429157011.0, + "step": 11774 + }, + { + "epoch": 2.1866295264623954, + "grad_norm": 1.5101014375686646, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8897371292114258, + "num_tokens": 429198453.0, + "step": 11775 + }, + { + "epoch": 2.186815227483751, + "grad_norm": 1.6770260334014893, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8730442523956299, + "num_tokens": 429237831.0, + "step": 11776 + }, + { + "epoch": 2.187000928505107, + "grad_norm": 1.7125729322433472, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8789792060852051, + "num_tokens": 429271157.0, + "step": 11777 + }, + { + "epoch": 2.1871866295264626, + "grad_norm": 1.4746791124343872, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8919913172721863, + "num_tokens": 429310562.0, + "step": 11778 + }, + { + "epoch": 2.187372330547818, + "grad_norm": 1.6084214448928833, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8737785816192627, + "num_tokens": 429350497.0, + "step": 11779 + }, + { + "epoch": 2.1875580315691736, + "grad_norm": 1.6253961324691772, + "learning_rate": 1e-06, + "loss": 0.2725, + "mean_token_accuracy": 0.9004503488540649, + "num_tokens": 429382292.0, + "step": 11780 + }, + { + "epoch": 2.1877437325905293, + "grad_norm": 1.791949987411499, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8782066106796265, + "num_tokens": 429417482.0, + "step": 11781 + }, + { + "epoch": 2.1879294336118846, + "grad_norm": 1.8303759098052979, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8863916993141174, + "num_tokens": 429446088.0, + "step": 11782 + }, + { + "epoch": 2.1881151346332404, + "grad_norm": 1.7722249031066895, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8861517906188965, + "num_tokens": 429477496.0, + "step": 11783 + }, + { + "epoch": 2.188300835654596, + "grad_norm": 1.5319589376449585, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.8961260318756104, + "num_tokens": 429517752.0, + "step": 11784 + }, + { + "epoch": 2.188486536675952, + "grad_norm": 1.629948616027832, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8930547833442688, + "num_tokens": 429550675.0, + "step": 11785 + }, + { + "epoch": 2.1886722376973076, + "grad_norm": 1.5749105215072632, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8913174271583557, + "num_tokens": 429586831.0, + "step": 11786 + }, + { + "epoch": 2.188857938718663, + "grad_norm": 1.5978851318359375, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8912655115127563, + "num_tokens": 429623913.0, + "step": 11787 + }, + { + "epoch": 2.1890436397400186, + "grad_norm": 1.6784989833831787, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.895597517490387, + "num_tokens": 429657946.0, + "step": 11788 + }, + { + "epoch": 2.1892293407613743, + "grad_norm": 1.7424564361572266, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8782548904418945, + "num_tokens": 429694671.0, + "step": 11789 + }, + { + "epoch": 2.1894150417827296, + "grad_norm": 1.568569302558899, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8876157999038696, + "num_tokens": 429732187.0, + "step": 11790 + }, + { + "epoch": 2.1896007428040853, + "grad_norm": 1.6813312768936157, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8835371732711792, + "num_tokens": 429767331.0, + "step": 11791 + }, + { + "epoch": 2.189786443825441, + "grad_norm": 1.8358005285263062, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.8943111896514893, + "num_tokens": 429793761.0, + "step": 11792 + }, + { + "epoch": 2.189972144846797, + "grad_norm": 1.6459234952926636, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8855980634689331, + "num_tokens": 429830179.0, + "step": 11793 + }, + { + "epoch": 2.190157845868152, + "grad_norm": 1.5900477170944214, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8815847635269165, + "num_tokens": 429870476.0, + "step": 11794 + }, + { + "epoch": 2.190343546889508, + "grad_norm": 1.5858349800109863, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8945184946060181, + "num_tokens": 429904815.0, + "step": 11795 + }, + { + "epoch": 2.1905292479108636, + "grad_norm": 1.8343576192855835, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8823873996734619, + "num_tokens": 429937847.0, + "step": 11796 + }, + { + "epoch": 2.1907149489322193, + "grad_norm": 1.805216908454895, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.8991488218307495, + "num_tokens": 429972445.0, + "step": 11797 + }, + { + "epoch": 2.1909006499535746, + "grad_norm": 2.0862655639648438, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8865997791290283, + "num_tokens": 430001107.0, + "step": 11798 + }, + { + "epoch": 2.1910863509749303, + "grad_norm": 1.4989980459213257, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8983839154243469, + "num_tokens": 430039539.0, + "step": 11799 + }, + { + "epoch": 2.191272051996286, + "grad_norm": 1.750234603881836, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8641652464866638, + "num_tokens": 430074555.0, + "step": 11800 + }, + { + "epoch": 2.191457753017642, + "grad_norm": 1.6902060508728027, + "learning_rate": 1e-06, + "loss": 0.2804, + "mean_token_accuracy": 0.9006283283233643, + "num_tokens": 430104599.0, + "step": 11801 + }, + { + "epoch": 2.191643454038997, + "grad_norm": 1.6842139959335327, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8865302801132202, + "num_tokens": 430137764.0, + "step": 11802 + }, + { + "epoch": 2.191829155060353, + "grad_norm": 1.70415461063385, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8822609782218933, + "num_tokens": 430171149.0, + "step": 11803 + }, + { + "epoch": 2.1920148560817085, + "grad_norm": 1.5793235301971436, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8881237506866455, + "num_tokens": 430205875.0, + "step": 11804 + }, + { + "epoch": 2.1922005571030643, + "grad_norm": 1.5099382400512695, + "learning_rate": 1e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.89991295337677, + "num_tokens": 430241387.0, + "step": 11805 + }, + { + "epoch": 2.1923862581244196, + "grad_norm": 1.5713876485824585, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8756558299064636, + "num_tokens": 430283851.0, + "step": 11806 + }, + { + "epoch": 2.1925719591457753, + "grad_norm": 1.6213620901107788, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8767539262771606, + "num_tokens": 430325062.0, + "step": 11807 + }, + { + "epoch": 2.192757660167131, + "grad_norm": 1.561553955078125, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8892862200737, + "num_tokens": 430358094.0, + "step": 11808 + }, + { + "epoch": 2.1929433611884868, + "grad_norm": 1.5907888412475586, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.86814284324646, + "num_tokens": 430399415.0, + "step": 11809 + }, + { + "epoch": 2.193129062209842, + "grad_norm": 1.5611965656280518, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8849852681159973, + "num_tokens": 430437963.0, + "step": 11810 + }, + { + "epoch": 2.193314763231198, + "grad_norm": 1.6518906354904175, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8751665949821472, + "num_tokens": 430474156.0, + "step": 11811 + }, + { + "epoch": 2.1935004642525535, + "grad_norm": 1.5247536897659302, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8977433443069458, + "num_tokens": 430511809.0, + "step": 11812 + }, + { + "epoch": 2.193686165273909, + "grad_norm": 1.5313246250152588, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8836055994033813, + "num_tokens": 430553221.0, + "step": 11813 + }, + { + "epoch": 2.1938718662952645, + "grad_norm": 1.7133721113204956, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.882434070110321, + "num_tokens": 430584019.0, + "step": 11814 + }, + { + "epoch": 2.1940575673166203, + "grad_norm": 1.4908840656280518, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8823872208595276, + "num_tokens": 430627639.0, + "step": 11815 + }, + { + "epoch": 2.194243268337976, + "grad_norm": 1.7280417680740356, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8938212394714355, + "num_tokens": 430658130.0, + "step": 11816 + }, + { + "epoch": 2.1944289693593313, + "grad_norm": 1.7140283584594727, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8922388553619385, + "num_tokens": 430689157.0, + "step": 11817 + }, + { + "epoch": 2.194614670380687, + "grad_norm": 1.4059046506881714, + "learning_rate": 1e-06, + "loss": 0.274, + "mean_token_accuracy": 0.902610719203949, + "num_tokens": 430732151.0, + "step": 11818 + }, + { + "epoch": 2.1948003714020428, + "grad_norm": 1.6322574615478516, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8882288932800293, + "num_tokens": 430767029.0, + "step": 11819 + }, + { + "epoch": 2.1949860724233985, + "grad_norm": 1.626373052597046, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8887283205986023, + "num_tokens": 430803663.0, + "step": 11820 + }, + { + "epoch": 2.195171773444754, + "grad_norm": 1.4434962272644043, + "learning_rate": 1e-06, + "loss": 0.2641, + "mean_token_accuracy": 0.9008598327636719, + "num_tokens": 430843434.0, + "step": 11821 + }, + { + "epoch": 2.1953574744661095, + "grad_norm": 1.6445263624191284, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8890282511711121, + "num_tokens": 430876195.0, + "step": 11822 + }, + { + "epoch": 2.1955431754874652, + "grad_norm": 1.556395411491394, + "learning_rate": 1e-06, + "loss": 0.2747, + "mean_token_accuracy": 0.8991268277168274, + "num_tokens": 430910921.0, + "step": 11823 + }, + { + "epoch": 2.195728876508821, + "grad_norm": 1.575725793838501, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8896549940109253, + "num_tokens": 430945596.0, + "step": 11824 + }, + { + "epoch": 2.1959145775301763, + "grad_norm": 1.6272776126861572, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8806157112121582, + "num_tokens": 430979202.0, + "step": 11825 + }, + { + "epoch": 2.196100278551532, + "grad_norm": 1.6771059036254883, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8902801275253296, + "num_tokens": 431013994.0, + "step": 11826 + }, + { + "epoch": 2.1962859795728877, + "grad_norm": 1.7219103574752808, + "learning_rate": 1e-06, + "loss": 0.26, + "mean_token_accuracy": 0.9049422144889832, + "num_tokens": 431041244.0, + "step": 11827 + }, + { + "epoch": 2.1964716805942435, + "grad_norm": 1.6993037462234497, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8847838044166565, + "num_tokens": 431071456.0, + "step": 11828 + }, + { + "epoch": 2.1966573816155988, + "grad_norm": 1.7968013286590576, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8940181732177734, + "num_tokens": 431101873.0, + "step": 11829 + }, + { + "epoch": 2.1968430826369545, + "grad_norm": 1.7641955614089966, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8602521419525146, + "num_tokens": 431135385.0, + "step": 11830 + }, + { + "epoch": 2.1970287836583102, + "grad_norm": 1.6036111116409302, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8787813186645508, + "num_tokens": 431171177.0, + "step": 11831 + }, + { + "epoch": 2.197214484679666, + "grad_norm": 1.7718907594680786, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8862466216087341, + "num_tokens": 431203721.0, + "step": 11832 + }, + { + "epoch": 2.1974001857010212, + "grad_norm": 1.8263293504714966, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8887113928794861, + "num_tokens": 431232512.0, + "step": 11833 + }, + { + "epoch": 2.197585886722377, + "grad_norm": 1.5859911441802979, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.895428478717804, + "num_tokens": 431265585.0, + "step": 11834 + }, + { + "epoch": 2.1977715877437327, + "grad_norm": 1.669845461845398, + "learning_rate": 1e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.8970494270324707, + "num_tokens": 431298935.0, + "step": 11835 + }, + { + "epoch": 2.197957288765088, + "grad_norm": 1.6420490741729736, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8863786458969116, + "num_tokens": 431332729.0, + "step": 11836 + }, + { + "epoch": 2.1981429897864437, + "grad_norm": 1.5588772296905518, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8978005647659302, + "num_tokens": 431369769.0, + "step": 11837 + }, + { + "epoch": 2.1983286908077995, + "grad_norm": 1.6485506296157837, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8753980398178101, + "num_tokens": 431406393.0, + "step": 11838 + }, + { + "epoch": 2.198514391829155, + "grad_norm": 1.5593748092651367, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8799208402633667, + "num_tokens": 431448939.0, + "step": 11839 + }, + { + "epoch": 2.1987000928505105, + "grad_norm": 1.5885872840881348, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8873966932296753, + "num_tokens": 431484067.0, + "step": 11840 + }, + { + "epoch": 2.1988857938718662, + "grad_norm": 1.7239890098571777, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8825181126594543, + "num_tokens": 431518567.0, + "step": 11841 + }, + { + "epoch": 2.199071494893222, + "grad_norm": 1.61311936378479, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.883773922920227, + "num_tokens": 431554230.0, + "step": 11842 + }, + { + "epoch": 2.1992571959145777, + "grad_norm": 1.4738277196884155, + "learning_rate": 1e-06, + "loss": 0.2817, + "mean_token_accuracy": 0.8949777483940125, + "num_tokens": 431593586.0, + "step": 11843 + }, + { + "epoch": 2.199442896935933, + "grad_norm": 1.6928163766860962, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8739719986915588, + "num_tokens": 431629094.0, + "step": 11844 + }, + { + "epoch": 2.1996285979572887, + "grad_norm": 1.6970281600952148, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8867183923721313, + "num_tokens": 431665337.0, + "step": 11845 + }, + { + "epoch": 2.1998142989786444, + "grad_norm": 1.6240980625152588, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8756240606307983, + "num_tokens": 431701843.0, + "step": 11846 + }, + { + "epoch": 2.2, + "grad_norm": 1.5708322525024414, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8818458914756775, + "num_tokens": 431745459.0, + "step": 11847 + }, + { + "epoch": 2.2001857010213555, + "grad_norm": 1.6230143308639526, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8950228691101074, + "num_tokens": 431783642.0, + "step": 11848 + }, + { + "epoch": 2.200371402042711, + "grad_norm": 1.6408679485321045, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8857809901237488, + "num_tokens": 431821931.0, + "step": 11849 + }, + { + "epoch": 2.200557103064067, + "grad_norm": 1.7425439357757568, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8749962449073792, + "num_tokens": 431857757.0, + "step": 11850 + }, + { + "epoch": 2.2007428040854227, + "grad_norm": 1.7613576650619507, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8712530136108398, + "num_tokens": 431891958.0, + "step": 11851 + }, + { + "epoch": 2.200928505106778, + "grad_norm": 1.7148889303207397, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8916026949882507, + "num_tokens": 431925444.0, + "step": 11852 + }, + { + "epoch": 2.2011142061281337, + "grad_norm": 1.571301817893982, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8945187330245972, + "num_tokens": 431963663.0, + "step": 11853 + }, + { + "epoch": 2.2012999071494894, + "grad_norm": 1.8188717365264893, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8858412504196167, + "num_tokens": 431996232.0, + "step": 11854 + }, + { + "epoch": 2.201485608170845, + "grad_norm": 1.640479564666748, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8934032917022705, + "num_tokens": 432033244.0, + "step": 11855 + }, + { + "epoch": 2.2016713091922004, + "grad_norm": 1.5260285139083862, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8949432373046875, + "num_tokens": 432069635.0, + "step": 11856 + }, + { + "epoch": 2.201857010213556, + "grad_norm": 1.636657953262329, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8731393814086914, + "num_tokens": 432105720.0, + "step": 11857 + }, + { + "epoch": 2.202042711234912, + "grad_norm": 1.5345507860183716, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8918513059616089, + "num_tokens": 432142107.0, + "step": 11858 + }, + { + "epoch": 2.202228412256267, + "grad_norm": 1.6477131843566895, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8799492120742798, + "num_tokens": 432185887.0, + "step": 11859 + }, + { + "epoch": 2.202414113277623, + "grad_norm": 1.713487148284912, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8835828304290771, + "num_tokens": 432223537.0, + "step": 11860 + }, + { + "epoch": 2.2025998142989787, + "grad_norm": 1.608970046043396, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.9008170962333679, + "num_tokens": 432255169.0, + "step": 11861 + }, + { + "epoch": 2.2027855153203344, + "grad_norm": 1.5902628898620605, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8834909200668335, + "num_tokens": 432292856.0, + "step": 11862 + }, + { + "epoch": 2.2029712163416897, + "grad_norm": 1.5870170593261719, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8813589811325073, + "num_tokens": 432333628.0, + "step": 11863 + }, + { + "epoch": 2.2031569173630454, + "grad_norm": 1.845057725906372, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8851993680000305, + "num_tokens": 432367323.0, + "step": 11864 + }, + { + "epoch": 2.203342618384401, + "grad_norm": 1.7382433414459229, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8797547817230225, + "num_tokens": 432402885.0, + "step": 11865 + }, + { + "epoch": 2.203528319405757, + "grad_norm": 1.8260105848312378, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.875961184501648, + "num_tokens": 432438174.0, + "step": 11866 + }, + { + "epoch": 2.203714020427112, + "grad_norm": 2.509847640991211, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8676126003265381, + "num_tokens": 432472791.0, + "step": 11867 + }, + { + "epoch": 2.203899721448468, + "grad_norm": 1.5073773860931396, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8842231035232544, + "num_tokens": 432511414.0, + "step": 11868 + }, + { + "epoch": 2.2040854224698236, + "grad_norm": 1.5313600301742554, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8984581232070923, + "num_tokens": 432547712.0, + "step": 11869 + }, + { + "epoch": 2.2042711234911794, + "grad_norm": 1.6962867975234985, + "learning_rate": 1e-06, + "loss": 0.2769, + "mean_token_accuracy": 0.9017961025238037, + "num_tokens": 432579144.0, + "step": 11870 + }, + { + "epoch": 2.2044568245125347, + "grad_norm": 1.6708779335021973, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.889629602432251, + "num_tokens": 432610807.0, + "step": 11871 + }, + { + "epoch": 2.2046425255338904, + "grad_norm": 1.6155788898468018, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8851902484893799, + "num_tokens": 432646177.0, + "step": 11872 + }, + { + "epoch": 2.204828226555246, + "grad_norm": 1.706812858581543, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.875300407409668, + "num_tokens": 432680266.0, + "step": 11873 + }, + { + "epoch": 2.205013927576602, + "grad_norm": 1.6157970428466797, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8925488591194153, + "num_tokens": 432717933.0, + "step": 11874 + }, + { + "epoch": 2.205199628597957, + "grad_norm": 1.9524720907211304, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8911494612693787, + "num_tokens": 432747258.0, + "step": 11875 + }, + { + "epoch": 2.205385329619313, + "grad_norm": 1.798524260520935, + "learning_rate": 1e-06, + "loss": 0.2831, + "mean_token_accuracy": 0.900746762752533, + "num_tokens": 432778829.0, + "step": 11876 + }, + { + "epoch": 2.2055710306406686, + "grad_norm": 1.6080125570297241, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8809837698936462, + "num_tokens": 432814951.0, + "step": 11877 + }, + { + "epoch": 2.2057567316620244, + "grad_norm": 1.699911117553711, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8870375156402588, + "num_tokens": 432848579.0, + "step": 11878 + }, + { + "epoch": 2.2059424326833796, + "grad_norm": 1.6860324144363403, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8775023221969604, + "num_tokens": 432884926.0, + "step": 11879 + }, + { + "epoch": 2.2061281337047354, + "grad_norm": 1.7475149631500244, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8759082555770874, + "num_tokens": 432920594.0, + "step": 11880 + }, + { + "epoch": 2.206313834726091, + "grad_norm": 1.6295864582061768, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8803945779800415, + "num_tokens": 432959980.0, + "step": 11881 + }, + { + "epoch": 2.2064995357474464, + "grad_norm": 1.4702268838882446, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8913549780845642, + "num_tokens": 433000125.0, + "step": 11882 + }, + { + "epoch": 2.206685236768802, + "grad_norm": 1.5996021032333374, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8848885893821716, + "num_tokens": 433036861.0, + "step": 11883 + }, + { + "epoch": 2.206870937790158, + "grad_norm": 1.6875603199005127, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.892130434513092, + "num_tokens": 433070455.0, + "step": 11884 + }, + { + "epoch": 2.2070566388115136, + "grad_norm": 1.55965256690979, + "learning_rate": 1e-06, + "loss": 0.2655, + "mean_token_accuracy": 0.9015060663223267, + "num_tokens": 433105244.0, + "step": 11885 + }, + { + "epoch": 2.207242339832869, + "grad_norm": 1.8207972049713135, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8830220699310303, + "num_tokens": 433138495.0, + "step": 11886 + }, + { + "epoch": 2.2074280408542246, + "grad_norm": 1.7170659303665161, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8779268860816956, + "num_tokens": 433172984.0, + "step": 11887 + }, + { + "epoch": 2.2076137418755803, + "grad_norm": 1.619299292564392, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8809552192687988, + "num_tokens": 433209388.0, + "step": 11888 + }, + { + "epoch": 2.207799442896936, + "grad_norm": 1.6933629512786865, + "learning_rate": 1e-06, + "loss": 0.2656, + "mean_token_accuracy": 0.9051992893218994, + "num_tokens": 433239842.0, + "step": 11889 + }, + { + "epoch": 2.2079851439182914, + "grad_norm": 1.6547696590423584, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8743629455566406, + "num_tokens": 433276023.0, + "step": 11890 + }, + { + "epoch": 2.208170844939647, + "grad_norm": 1.716016411781311, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8879483342170715, + "num_tokens": 433309859.0, + "step": 11891 + }, + { + "epoch": 2.208356545961003, + "grad_norm": 1.691304326057434, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8835758566856384, + "num_tokens": 433345369.0, + "step": 11892 + }, + { + "epoch": 2.2085422469823586, + "grad_norm": 1.689634919166565, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8916059732437134, + "num_tokens": 433380647.0, + "step": 11893 + }, + { + "epoch": 2.208727948003714, + "grad_norm": 1.8208197355270386, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8901987075805664, + "num_tokens": 433412023.0, + "step": 11894 + }, + { + "epoch": 2.2089136490250696, + "grad_norm": 1.7304168939590454, + "learning_rate": 1e-06, + "loss": 0.2806, + "mean_token_accuracy": 0.8949595093727112, + "num_tokens": 433441432.0, + "step": 11895 + }, + { + "epoch": 2.2090993500464253, + "grad_norm": 1.5460472106933594, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.885303795337677, + "num_tokens": 433478543.0, + "step": 11896 + }, + { + "epoch": 2.209285051067781, + "grad_norm": 1.648919701576233, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8778148889541626, + "num_tokens": 433514935.0, + "step": 11897 + }, + { + "epoch": 2.2094707520891363, + "grad_norm": 1.613023281097412, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.8970808982849121, + "num_tokens": 433549168.0, + "step": 11898 + }, + { + "epoch": 2.209656453110492, + "grad_norm": 1.61306631565094, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.889161229133606, + "num_tokens": 433584521.0, + "step": 11899 + }, + { + "epoch": 2.209842154131848, + "grad_norm": 1.7069530487060547, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8842966556549072, + "num_tokens": 433616833.0, + "step": 11900 + }, + { + "epoch": 2.2100278551532035, + "grad_norm": 1.816232442855835, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8852949142456055, + "num_tokens": 433645084.0, + "step": 11901 + }, + { + "epoch": 2.210213556174559, + "grad_norm": 1.5621079206466675, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.881690502166748, + "num_tokens": 433686941.0, + "step": 11902 + }, + { + "epoch": 2.2103992571959146, + "grad_norm": 1.65900456905365, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.89364093542099, + "num_tokens": 433719127.0, + "step": 11903 + }, + { + "epoch": 2.2105849582172703, + "grad_norm": 1.5963267087936401, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8951247930526733, + "num_tokens": 433752586.0, + "step": 11904 + }, + { + "epoch": 2.2107706592386256, + "grad_norm": 1.524766206741333, + "learning_rate": 1e-06, + "loss": 0.2745, + "mean_token_accuracy": 0.8977348208427429, + "num_tokens": 433787456.0, + "step": 11905 + }, + { + "epoch": 2.2109563602599813, + "grad_norm": 1.4642155170440674, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8895821571350098, + "num_tokens": 433827591.0, + "step": 11906 + }, + { + "epoch": 2.211142061281337, + "grad_norm": 1.4515469074249268, + "learning_rate": 1e-06, + "loss": 0.2618, + "mean_token_accuracy": 0.9040685892105103, + "num_tokens": 433866986.0, + "step": 11907 + }, + { + "epoch": 2.211327762302693, + "grad_norm": 1.5385544300079346, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8800591230392456, + "num_tokens": 433908147.0, + "step": 11908 + }, + { + "epoch": 2.211513463324048, + "grad_norm": 1.5864307880401611, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8820637464523315, + "num_tokens": 433948443.0, + "step": 11909 + }, + { + "epoch": 2.211699164345404, + "grad_norm": 1.7074326276779175, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8850870132446289, + "num_tokens": 433984404.0, + "step": 11910 + }, + { + "epoch": 2.2118848653667595, + "grad_norm": 1.576841950416565, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.885611891746521, + "num_tokens": 434025693.0, + "step": 11911 + }, + { + "epoch": 2.2120705663881153, + "grad_norm": 1.6355952024459839, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8721671104431152, + "num_tokens": 434063702.0, + "step": 11912 + }, + { + "epoch": 2.2122562674094706, + "grad_norm": 1.4881867170333862, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.892742395401001, + "num_tokens": 434102569.0, + "step": 11913 + }, + { + "epoch": 2.2124419684308263, + "grad_norm": 1.6373709440231323, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.8989413380622864, + "num_tokens": 434137170.0, + "step": 11914 + }, + { + "epoch": 2.212627669452182, + "grad_norm": 1.637032151222229, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8941120505332947, + "num_tokens": 434170450.0, + "step": 11915 + }, + { + "epoch": 2.2128133704735378, + "grad_norm": 1.6489747762680054, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8835040330886841, + "num_tokens": 434207088.0, + "step": 11916 + }, + { + "epoch": 2.212999071494893, + "grad_norm": 1.549688696861267, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.898731529712677, + "num_tokens": 434243912.0, + "step": 11917 + }, + { + "epoch": 2.213184772516249, + "grad_norm": 1.6409248113632202, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8916288614273071, + "num_tokens": 434274944.0, + "step": 11918 + }, + { + "epoch": 2.2133704735376045, + "grad_norm": 1.6624155044555664, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.896183967590332, + "num_tokens": 434309892.0, + "step": 11919 + }, + { + "epoch": 2.2135561745589603, + "grad_norm": 1.658631682395935, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.890315055847168, + "num_tokens": 434346477.0, + "step": 11920 + }, + { + "epoch": 2.2137418755803155, + "grad_norm": 1.591326355934143, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8766418695449829, + "num_tokens": 434386211.0, + "step": 11921 + }, + { + "epoch": 2.2139275766016713, + "grad_norm": 1.626169204711914, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8887624740600586, + "num_tokens": 434419192.0, + "step": 11922 + }, + { + "epoch": 2.214113277623027, + "grad_norm": 1.6731542348861694, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8889075517654419, + "num_tokens": 434451355.0, + "step": 11923 + }, + { + "epoch": 2.2142989786443827, + "grad_norm": 1.4824471473693848, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8923234939575195, + "num_tokens": 434492288.0, + "step": 11924 + }, + { + "epoch": 2.214484679665738, + "grad_norm": 1.830807089805603, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8751684427261353, + "num_tokens": 434526506.0, + "step": 11925 + }, + { + "epoch": 2.2146703806870938, + "grad_norm": 1.6170099973678589, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8864020109176636, + "num_tokens": 434565998.0, + "step": 11926 + }, + { + "epoch": 2.2148560817084495, + "grad_norm": 1.5809074640274048, + "learning_rate": 1e-06, + "loss": 0.282, + "mean_token_accuracy": 0.8978829383850098, + "num_tokens": 434603690.0, + "step": 11927 + }, + { + "epoch": 2.215041782729805, + "grad_norm": 1.5839054584503174, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8867484331130981, + "num_tokens": 434642814.0, + "step": 11928 + }, + { + "epoch": 2.2152274837511605, + "grad_norm": 1.7908729314804077, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8743627071380615, + "num_tokens": 434677363.0, + "step": 11929 + }, + { + "epoch": 2.2154131847725163, + "grad_norm": 1.7919611930847168, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8791742324829102, + "num_tokens": 434714525.0, + "step": 11930 + }, + { + "epoch": 2.215598885793872, + "grad_norm": 1.6488324403762817, + "learning_rate": 1e-06, + "loss": 0.2518, + "mean_token_accuracy": 0.9069492220878601, + "num_tokens": 434743911.0, + "step": 11931 + }, + { + "epoch": 2.2157845868152277, + "grad_norm": 1.593629240989685, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8836443424224854, + "num_tokens": 434781938.0, + "step": 11932 + }, + { + "epoch": 2.215970287836583, + "grad_norm": 1.59132981300354, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8860028386116028, + "num_tokens": 434822132.0, + "step": 11933 + }, + { + "epoch": 2.2161559888579387, + "grad_norm": 1.7383302450180054, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.881624162197113, + "num_tokens": 434855941.0, + "step": 11934 + }, + { + "epoch": 2.2163416898792945, + "grad_norm": 1.805199384689331, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8743071556091309, + "num_tokens": 434888523.0, + "step": 11935 + }, + { + "epoch": 2.2165273909006498, + "grad_norm": 1.5540837049484253, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8914315700531006, + "num_tokens": 434928114.0, + "step": 11936 + }, + { + "epoch": 2.2167130919220055, + "grad_norm": 1.8025953769683838, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8922157287597656, + "num_tokens": 434956869.0, + "step": 11937 + }, + { + "epoch": 2.2168987929433612, + "grad_norm": 1.598109483718872, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.884167492389679, + "num_tokens": 434993169.0, + "step": 11938 + }, + { + "epoch": 2.217084493964717, + "grad_norm": 1.5733880996704102, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8936632871627808, + "num_tokens": 435031303.0, + "step": 11939 + }, + { + "epoch": 2.2172701949860723, + "grad_norm": 1.518674373626709, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8786342144012451, + "num_tokens": 435073990.0, + "step": 11940 + }, + { + "epoch": 2.217455896007428, + "grad_norm": 1.6735084056854248, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8706374168395996, + "num_tokens": 435109977.0, + "step": 11941 + }, + { + "epoch": 2.2176415970287837, + "grad_norm": 1.73655366897583, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8755121231079102, + "num_tokens": 435144063.0, + "step": 11942 + }, + { + "epoch": 2.2178272980501395, + "grad_norm": 1.6125059127807617, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8963929414749146, + "num_tokens": 435175963.0, + "step": 11943 + }, + { + "epoch": 2.2180129990714947, + "grad_norm": 1.6760680675506592, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8805662393569946, + "num_tokens": 435213351.0, + "step": 11944 + }, + { + "epoch": 2.2181987000928505, + "grad_norm": 1.4257901906967163, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.8992806673049927, + "num_tokens": 435251633.0, + "step": 11945 + }, + { + "epoch": 2.218384401114206, + "grad_norm": 1.6481572389602661, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8800125122070312, + "num_tokens": 435286170.0, + "step": 11946 + }, + { + "epoch": 2.218570102135562, + "grad_norm": 1.5594046115875244, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8965930938720703, + "num_tokens": 435321646.0, + "step": 11947 + }, + { + "epoch": 2.2187558031569172, + "grad_norm": 1.683327317237854, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8855383992195129, + "num_tokens": 435355369.0, + "step": 11948 + }, + { + "epoch": 2.218941504178273, + "grad_norm": 1.5858452320098877, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8938893675804138, + "num_tokens": 435389972.0, + "step": 11949 + }, + { + "epoch": 2.2191272051996287, + "grad_norm": 1.5251951217651367, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8799092769622803, + "num_tokens": 435430842.0, + "step": 11950 + }, + { + "epoch": 2.219312906220984, + "grad_norm": 1.5722230672836304, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8924721479415894, + "num_tokens": 435469279.0, + "step": 11951 + }, + { + "epoch": 2.2194986072423397, + "grad_norm": 1.538959264755249, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8764642477035522, + "num_tokens": 435512526.0, + "step": 11952 + }, + { + "epoch": 2.2196843082636954, + "grad_norm": 1.6326292753219604, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8876924514770508, + "num_tokens": 435548333.0, + "step": 11953 + }, + { + "epoch": 2.219870009285051, + "grad_norm": 1.6737855672836304, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8836861252784729, + "num_tokens": 435583552.0, + "step": 11954 + }, + { + "epoch": 2.220055710306407, + "grad_norm": 1.6672755479812622, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.876492440700531, + "num_tokens": 435622596.0, + "step": 11955 + }, + { + "epoch": 2.220241411327762, + "grad_norm": 1.542587161064148, + "learning_rate": 1e-06, + "loss": 0.28, + "mean_token_accuracy": 0.8989909887313843, + "num_tokens": 435660113.0, + "step": 11956 + }, + { + "epoch": 2.220427112349118, + "grad_norm": 1.7894525527954102, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8837429285049438, + "num_tokens": 435693341.0, + "step": 11957 + }, + { + "epoch": 2.2206128133704737, + "grad_norm": 1.7542544603347778, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8747673034667969, + "num_tokens": 435729238.0, + "step": 11958 + }, + { + "epoch": 2.220798514391829, + "grad_norm": 1.585533857345581, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.8954206705093384, + "num_tokens": 435768945.0, + "step": 11959 + }, + { + "epoch": 2.2209842154131847, + "grad_norm": 1.737380027770996, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8822829723358154, + "num_tokens": 435801118.0, + "step": 11960 + }, + { + "epoch": 2.2211699164345404, + "grad_norm": 1.4896504878997803, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8859103322029114, + "num_tokens": 435841946.0, + "step": 11961 + }, + { + "epoch": 2.221355617455896, + "grad_norm": 1.5918288230895996, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8835777640342712, + "num_tokens": 435882981.0, + "step": 11962 + }, + { + "epoch": 2.2215413184772514, + "grad_norm": 1.4906915426254272, + "learning_rate": 1e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.895670473575592, + "num_tokens": 435917928.0, + "step": 11963 + }, + { + "epoch": 2.221727019498607, + "grad_norm": 1.6568373441696167, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8819430470466614, + "num_tokens": 435951704.0, + "step": 11964 + }, + { + "epoch": 2.221912720519963, + "grad_norm": 1.5019046068191528, + "learning_rate": 1e-06, + "loss": 0.2669, + "mean_token_accuracy": 0.9027581214904785, + "num_tokens": 435986688.0, + "step": 11965 + }, + { + "epoch": 2.2220984215413186, + "grad_norm": 1.648033857345581, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.896111249923706, + "num_tokens": 436019374.0, + "step": 11966 + }, + { + "epoch": 2.222284122562674, + "grad_norm": 1.731143593788147, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8872085809707642, + "num_tokens": 436052091.0, + "step": 11967 + }, + { + "epoch": 2.2224698235840297, + "grad_norm": 1.7152471542358398, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8959167003631592, + "num_tokens": 436087005.0, + "step": 11968 + }, + { + "epoch": 2.2226555246053854, + "grad_norm": 1.664557933807373, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8869059085845947, + "num_tokens": 436123219.0, + "step": 11969 + }, + { + "epoch": 2.222841225626741, + "grad_norm": 1.583105206489563, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8948837518692017, + "num_tokens": 436158343.0, + "step": 11970 + }, + { + "epoch": 2.2230269266480964, + "grad_norm": 1.8014601469039917, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8719804286956787, + "num_tokens": 436192578.0, + "step": 11971 + }, + { + "epoch": 2.223212627669452, + "grad_norm": 1.553597092628479, + "learning_rate": 1e-06, + "loss": 0.2746, + "mean_token_accuracy": 0.9016841650009155, + "num_tokens": 436227478.0, + "step": 11972 + }, + { + "epoch": 2.223398328690808, + "grad_norm": 1.6789207458496094, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8771965503692627, + "num_tokens": 436264013.0, + "step": 11973 + }, + { + "epoch": 2.2235840297121636, + "grad_norm": 1.663442850112915, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8911494016647339, + "num_tokens": 436299957.0, + "step": 11974 + }, + { + "epoch": 2.223769730733519, + "grad_norm": 1.5703799724578857, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8782623410224915, + "num_tokens": 436340530.0, + "step": 11975 + }, + { + "epoch": 2.2239554317548746, + "grad_norm": 1.745432734489441, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.8968595862388611, + "num_tokens": 436369184.0, + "step": 11976 + }, + { + "epoch": 2.2241411327762304, + "grad_norm": 1.6939446926116943, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8852121233940125, + "num_tokens": 436401519.0, + "step": 11977 + }, + { + "epoch": 2.224326833797586, + "grad_norm": 1.5930917263031006, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8838115334510803, + "num_tokens": 436439296.0, + "step": 11978 + }, + { + "epoch": 2.2245125348189414, + "grad_norm": 1.6433651447296143, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8844285011291504, + "num_tokens": 436475882.0, + "step": 11979 + }, + { + "epoch": 2.224698235840297, + "grad_norm": 1.660078525543213, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8805582523345947, + "num_tokens": 436512585.0, + "step": 11980 + }, + { + "epoch": 2.224883936861653, + "grad_norm": 1.6498911380767822, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8923415541648865, + "num_tokens": 436548704.0, + "step": 11981 + }, + { + "epoch": 2.225069637883008, + "grad_norm": 1.5355358123779297, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8883135914802551, + "num_tokens": 436585138.0, + "step": 11982 + }, + { + "epoch": 2.225255338904364, + "grad_norm": 1.5688533782958984, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8773571252822876, + "num_tokens": 436624467.0, + "step": 11983 + }, + { + "epoch": 2.2254410399257196, + "grad_norm": 1.589412808418274, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8847651481628418, + "num_tokens": 436661487.0, + "step": 11984 + }, + { + "epoch": 2.2256267409470754, + "grad_norm": 1.4283065795898438, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8800419569015503, + "num_tokens": 436707432.0, + "step": 11985 + }, + { + "epoch": 2.2258124419684306, + "grad_norm": 1.5463613271713257, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8885694742202759, + "num_tokens": 436743669.0, + "step": 11986 + }, + { + "epoch": 2.2259981429897864, + "grad_norm": 1.589845061302185, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.891700804233551, + "num_tokens": 436779256.0, + "step": 11987 + }, + { + "epoch": 2.226183844011142, + "grad_norm": 1.6091010570526123, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8804585337638855, + "num_tokens": 436815359.0, + "step": 11988 + }, + { + "epoch": 2.226369545032498, + "grad_norm": 1.5383204221725464, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.882298469543457, + "num_tokens": 436855132.0, + "step": 11989 + }, + { + "epoch": 2.226555246053853, + "grad_norm": 1.5272507667541504, + "learning_rate": 1e-06, + "loss": 0.2645, + "mean_token_accuracy": 0.9034937620162964, + "num_tokens": 436889483.0, + "step": 11990 + }, + { + "epoch": 2.226740947075209, + "grad_norm": 1.5924497842788696, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8845880031585693, + "num_tokens": 436928688.0, + "step": 11991 + }, + { + "epoch": 2.2269266480965646, + "grad_norm": 1.7270140647888184, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.879183292388916, + "num_tokens": 436960534.0, + "step": 11992 + }, + { + "epoch": 2.2271123491179203, + "grad_norm": 1.5930380821228027, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8986432552337646, + "num_tokens": 436995182.0, + "step": 11993 + }, + { + "epoch": 2.2272980501392756, + "grad_norm": 1.5338999032974243, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8812750577926636, + "num_tokens": 437032721.0, + "step": 11994 + }, + { + "epoch": 2.2274837511606314, + "grad_norm": 1.5951799154281616, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8835245370864868, + "num_tokens": 437067603.0, + "step": 11995 + }, + { + "epoch": 2.227669452181987, + "grad_norm": 1.6601979732513428, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8853137493133545, + "num_tokens": 437103033.0, + "step": 11996 + }, + { + "epoch": 2.227855153203343, + "grad_norm": 1.6783485412597656, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8857219219207764, + "num_tokens": 437137224.0, + "step": 11997 + }, + { + "epoch": 2.228040854224698, + "grad_norm": 1.5096567869186401, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8899605870246887, + "num_tokens": 437175211.0, + "step": 11998 + }, + { + "epoch": 2.228226555246054, + "grad_norm": 1.4492515325546265, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8955100178718567, + "num_tokens": 437216118.0, + "step": 11999 + }, + { + "epoch": 2.2284122562674096, + "grad_norm": 1.7664072513580322, + "learning_rate": 1e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.8961641788482666, + "num_tokens": 437244217.0, + "step": 12000 + }, + { + "epoch": 2.2285979572887653, + "grad_norm": 1.6974509954452515, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8820602297782898, + "num_tokens": 437278946.0, + "step": 12001 + }, + { + "epoch": 2.2287836583101206, + "grad_norm": 1.6432503461837769, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8822047710418701, + "num_tokens": 437313758.0, + "step": 12002 + }, + { + "epoch": 2.2289693593314763, + "grad_norm": 1.6218781471252441, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8752692937850952, + "num_tokens": 437352869.0, + "step": 12003 + }, + { + "epoch": 2.229155060352832, + "grad_norm": 1.5652532577514648, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8807013034820557, + "num_tokens": 437392872.0, + "step": 12004 + }, + { + "epoch": 2.2293407613741874, + "grad_norm": 1.6744052171707153, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.9001611471176147, + "num_tokens": 437427686.0, + "step": 12005 + }, + { + "epoch": 2.229526462395543, + "grad_norm": 1.575981855392456, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8971362709999084, + "num_tokens": 437465296.0, + "step": 12006 + }, + { + "epoch": 2.229712163416899, + "grad_norm": 1.786043405532837, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8851935267448425, + "num_tokens": 437497443.0, + "step": 12007 + }, + { + "epoch": 2.2298978644382546, + "grad_norm": 1.6986342668533325, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8836207389831543, + "num_tokens": 437530698.0, + "step": 12008 + }, + { + "epoch": 2.23008356545961, + "grad_norm": 1.6566210985183716, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8912222385406494, + "num_tokens": 437566188.0, + "step": 12009 + }, + { + "epoch": 2.2302692664809656, + "grad_norm": 1.6505714654922485, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8801787495613098, + "num_tokens": 437602588.0, + "step": 12010 + }, + { + "epoch": 2.2304549675023213, + "grad_norm": 1.4443204402923584, + "learning_rate": 1e-06, + "loss": 0.2649, + "mean_token_accuracy": 0.9056622385978699, + "num_tokens": 437639216.0, + "step": 12011 + }, + { + "epoch": 2.230640668523677, + "grad_norm": 1.5926848649978638, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8900629281997681, + "num_tokens": 437675828.0, + "step": 12012 + }, + { + "epoch": 2.2308263695450323, + "grad_norm": 1.5843219757080078, + "learning_rate": 1e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.8974238634109497, + "num_tokens": 437712305.0, + "step": 12013 + }, + { + "epoch": 2.231012070566388, + "grad_norm": 1.7591627836227417, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8850001096725464, + "num_tokens": 437746470.0, + "step": 12014 + }, + { + "epoch": 2.231197771587744, + "grad_norm": 1.5423797369003296, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8934981226921082, + "num_tokens": 437781548.0, + "step": 12015 + }, + { + "epoch": 2.2313834726090995, + "grad_norm": 1.6222912073135376, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.887391209602356, + "num_tokens": 437817702.0, + "step": 12016 + }, + { + "epoch": 2.231569173630455, + "grad_norm": 1.6597752571105957, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8751153945922852, + "num_tokens": 437856392.0, + "step": 12017 + }, + { + "epoch": 2.2317548746518105, + "grad_norm": 1.7061656713485718, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8744359016418457, + "num_tokens": 437890409.0, + "step": 12018 + }, + { + "epoch": 2.2319405756731663, + "grad_norm": 1.6186330318450928, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8953009843826294, + "num_tokens": 437926471.0, + "step": 12019 + }, + { + "epoch": 2.232126276694522, + "grad_norm": 1.4964895248413086, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8943634629249573, + "num_tokens": 437962774.0, + "step": 12020 + }, + { + "epoch": 2.2323119777158773, + "grad_norm": 1.6357775926589966, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8823689222335815, + "num_tokens": 437998093.0, + "step": 12021 + }, + { + "epoch": 2.232497678737233, + "grad_norm": 1.6181395053863525, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8848056793212891, + "num_tokens": 438035529.0, + "step": 12022 + }, + { + "epoch": 2.2326833797585888, + "grad_norm": 1.5815142393112183, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8893292546272278, + "num_tokens": 438073040.0, + "step": 12023 + }, + { + "epoch": 2.2328690807799445, + "grad_norm": 1.6956678628921509, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8858271837234497, + "num_tokens": 438106314.0, + "step": 12024 + }, + { + "epoch": 2.2330547818013, + "grad_norm": 1.6510478258132935, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8733587861061096, + "num_tokens": 438142849.0, + "step": 12025 + }, + { + "epoch": 2.2332404828226555, + "grad_norm": 1.4822543859481812, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8828263282775879, + "num_tokens": 438184216.0, + "step": 12026 + }, + { + "epoch": 2.2334261838440113, + "grad_norm": 1.6391332149505615, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8946599960327148, + "num_tokens": 438220416.0, + "step": 12027 + }, + { + "epoch": 2.2336118848653665, + "grad_norm": 1.6454275846481323, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8810828924179077, + "num_tokens": 438255953.0, + "step": 12028 + }, + { + "epoch": 2.2337975858867223, + "grad_norm": 1.5110355615615845, + "learning_rate": 1e-06, + "loss": 0.2677, + "mean_token_accuracy": 0.9020793437957764, + "num_tokens": 438292445.0, + "step": 12029 + }, + { + "epoch": 2.233983286908078, + "grad_norm": 1.677632212638855, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8735485672950745, + "num_tokens": 438331525.0, + "step": 12030 + }, + { + "epoch": 2.2341689879294337, + "grad_norm": 1.580047369003296, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8921154737472534, + "num_tokens": 438367811.0, + "step": 12031 + }, + { + "epoch": 2.234354688950789, + "grad_norm": 1.6692183017730713, + "learning_rate": 1e-06, + "loss": 0.2768, + "mean_token_accuracy": 0.8984342813491821, + "num_tokens": 438402165.0, + "step": 12032 + }, + { + "epoch": 2.2345403899721448, + "grad_norm": 1.6772632598876953, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.888961911201477, + "num_tokens": 438436367.0, + "step": 12033 + }, + { + "epoch": 2.2347260909935005, + "grad_norm": 1.4251786470413208, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8873013257980347, + "num_tokens": 438478678.0, + "step": 12034 + }, + { + "epoch": 2.2349117920148562, + "grad_norm": 1.6468636989593506, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8818671107292175, + "num_tokens": 438515210.0, + "step": 12035 + }, + { + "epoch": 2.2350974930362115, + "grad_norm": 1.518036961555481, + "learning_rate": 1e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.8961474299430847, + "num_tokens": 438551725.0, + "step": 12036 + }, + { + "epoch": 2.2352831940575673, + "grad_norm": 1.5529751777648926, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8984493613243103, + "num_tokens": 438590184.0, + "step": 12037 + }, + { + "epoch": 2.235468895078923, + "grad_norm": 1.5424162149429321, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8902097940444946, + "num_tokens": 438626975.0, + "step": 12038 + }, + { + "epoch": 2.2356545961002787, + "grad_norm": 1.5947314500808716, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8858866095542908, + "num_tokens": 438661251.0, + "step": 12039 + }, + { + "epoch": 2.235840297121634, + "grad_norm": 1.5591319799423218, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8868242502212524, + "num_tokens": 438700655.0, + "step": 12040 + }, + { + "epoch": 2.2360259981429897, + "grad_norm": 1.4290201663970947, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8937839269638062, + "num_tokens": 438741686.0, + "step": 12041 + }, + { + "epoch": 2.2362116991643455, + "grad_norm": 1.5789105892181396, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8867096900939941, + "num_tokens": 438777647.0, + "step": 12042 + }, + { + "epoch": 2.236397400185701, + "grad_norm": 1.7885525226593018, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8730811476707458, + "num_tokens": 438811068.0, + "step": 12043 + }, + { + "epoch": 2.2365831012070565, + "grad_norm": 1.734978199005127, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.890500545501709, + "num_tokens": 438845947.0, + "step": 12044 + }, + { + "epoch": 2.2367688022284122, + "grad_norm": 1.6089305877685547, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.884784996509552, + "num_tokens": 438885820.0, + "step": 12045 + }, + { + "epoch": 2.236954503249768, + "grad_norm": 1.4790351390838623, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.889096736907959, + "num_tokens": 438928699.0, + "step": 12046 + }, + { + "epoch": 2.2371402042711237, + "grad_norm": 1.6536201238632202, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8833155632019043, + "num_tokens": 438965526.0, + "step": 12047 + }, + { + "epoch": 2.237325905292479, + "grad_norm": 1.5776480436325073, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8724061250686646, + "num_tokens": 439006634.0, + "step": 12048 + }, + { + "epoch": 2.2375116063138347, + "grad_norm": 1.6267986297607422, + "learning_rate": 1e-06, + "loss": 0.2746, + "mean_token_accuracy": 0.8987874388694763, + "num_tokens": 439038657.0, + "step": 12049 + }, + { + "epoch": 2.2376973073351905, + "grad_norm": 1.737163782119751, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8855072259902954, + "num_tokens": 439073758.0, + "step": 12050 + }, + { + "epoch": 2.2378830083565457, + "grad_norm": 1.5013747215270996, + "learning_rate": 1e-06, + "loss": 0.2444, + "mean_token_accuracy": 0.9104204177856445, + "num_tokens": 439108866.0, + "step": 12051 + }, + { + "epoch": 2.2380687093779015, + "grad_norm": 1.670662760734558, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8845076560974121, + "num_tokens": 439142020.0, + "step": 12052 + }, + { + "epoch": 2.238254410399257, + "grad_norm": 1.586849570274353, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8834863901138306, + "num_tokens": 439178290.0, + "step": 12053 + }, + { + "epoch": 2.238440111420613, + "grad_norm": 1.6941720247268677, + "learning_rate": 1e-06, + "loss": 0.2853, + "mean_token_accuracy": 0.895051121711731, + "num_tokens": 439210821.0, + "step": 12054 + }, + { + "epoch": 2.2386258124419682, + "grad_norm": 1.651183843612671, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8907660245895386, + "num_tokens": 439244431.0, + "step": 12055 + }, + { + "epoch": 2.238811513463324, + "grad_norm": 1.587968349456787, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8926560878753662, + "num_tokens": 439283451.0, + "step": 12056 + }, + { + "epoch": 2.2389972144846797, + "grad_norm": 1.5110533237457275, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8809918165206909, + "num_tokens": 439326548.0, + "step": 12057 + }, + { + "epoch": 2.2391829155060354, + "grad_norm": 1.8401440382003784, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8764669895172119, + "num_tokens": 439358077.0, + "step": 12058 + }, + { + "epoch": 2.2393686165273907, + "grad_norm": 1.513156771659851, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8779094219207764, + "num_tokens": 439400130.0, + "step": 12059 + }, + { + "epoch": 2.2395543175487465, + "grad_norm": 1.5644367933273315, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8981112241744995, + "num_tokens": 439435980.0, + "step": 12060 + }, + { + "epoch": 2.239740018570102, + "grad_norm": 1.7044602632522583, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8804761171340942, + "num_tokens": 439469913.0, + "step": 12061 + }, + { + "epoch": 2.239925719591458, + "grad_norm": 1.6870218515396118, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8725147247314453, + "num_tokens": 439505407.0, + "step": 12062 + }, + { + "epoch": 2.240111420612813, + "grad_norm": 1.6853381395339966, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8796777129173279, + "num_tokens": 439541742.0, + "step": 12063 + }, + { + "epoch": 2.240297121634169, + "grad_norm": 1.612331509590149, + "learning_rate": 1e-06, + "loss": 0.285, + "mean_token_accuracy": 0.895574152469635, + "num_tokens": 439579012.0, + "step": 12064 + }, + { + "epoch": 2.2404828226555247, + "grad_norm": 1.6476908922195435, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8922565579414368, + "num_tokens": 439616016.0, + "step": 12065 + }, + { + "epoch": 2.2406685236768804, + "grad_norm": 1.5439257621765137, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8865566253662109, + "num_tokens": 439654555.0, + "step": 12066 + }, + { + "epoch": 2.2408542246982357, + "grad_norm": 1.7663079500198364, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8862903118133545, + "num_tokens": 439688086.0, + "step": 12067 + }, + { + "epoch": 2.2410399257195914, + "grad_norm": 1.6072384119033813, + "learning_rate": 1e-06, + "loss": 0.2655, + "mean_token_accuracy": 0.9039009213447571, + "num_tokens": 439721250.0, + "step": 12068 + }, + { + "epoch": 2.241225626740947, + "grad_norm": 1.668611764907837, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8880724906921387, + "num_tokens": 439752167.0, + "step": 12069 + }, + { + "epoch": 2.241411327762303, + "grad_norm": 1.6503766775131226, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8802834749221802, + "num_tokens": 439789519.0, + "step": 12070 + }, + { + "epoch": 2.241597028783658, + "grad_norm": 1.7773350477218628, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8775196075439453, + "num_tokens": 439821930.0, + "step": 12071 + }, + { + "epoch": 2.241782729805014, + "grad_norm": 1.771637201309204, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8870221376419067, + "num_tokens": 439855512.0, + "step": 12072 + }, + { + "epoch": 2.2419684308263697, + "grad_norm": 1.4056917428970337, + "learning_rate": 1e-06, + "loss": 0.2491, + "mean_token_accuracy": 0.9080857634544373, + "num_tokens": 439894413.0, + "step": 12073 + }, + { + "epoch": 2.242154131847725, + "grad_norm": 1.5311733484268188, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8857146501541138, + "num_tokens": 439937473.0, + "step": 12074 + }, + { + "epoch": 2.2423398328690807, + "grad_norm": 1.790419578552246, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8798652291297913, + "num_tokens": 439967331.0, + "step": 12075 + }, + { + "epoch": 2.2425255338904364, + "grad_norm": 1.52906334400177, + "learning_rate": 1e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.8981838822364807, + "num_tokens": 440005314.0, + "step": 12076 + }, + { + "epoch": 2.242711234911792, + "grad_norm": 1.5515077114105225, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8868409395217896, + "num_tokens": 440045576.0, + "step": 12077 + }, + { + "epoch": 2.2428969359331474, + "grad_norm": 1.7411370277404785, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8795566558837891, + "num_tokens": 440079932.0, + "step": 12078 + }, + { + "epoch": 2.243082636954503, + "grad_norm": 1.7423804998397827, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8867861032485962, + "num_tokens": 440114553.0, + "step": 12079 + }, + { + "epoch": 2.243268337975859, + "grad_norm": 1.6519358158111572, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8921387195587158, + "num_tokens": 440147812.0, + "step": 12080 + }, + { + "epoch": 2.2434540389972146, + "grad_norm": 1.702936053276062, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8760850429534912, + "num_tokens": 440184420.0, + "step": 12081 + }, + { + "epoch": 2.24363974001857, + "grad_norm": 1.605139136314392, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8810592293739319, + "num_tokens": 440224327.0, + "step": 12082 + }, + { + "epoch": 2.2438254410399256, + "grad_norm": 1.6180986166000366, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8848230838775635, + "num_tokens": 440262711.0, + "step": 12083 + }, + { + "epoch": 2.2440111420612814, + "grad_norm": 1.8063452243804932, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8803492188453674, + "num_tokens": 440294225.0, + "step": 12084 + }, + { + "epoch": 2.244196843082637, + "grad_norm": 1.7375885248184204, + "learning_rate": 1e-06, + "loss": 0.2805, + "mean_token_accuracy": 0.8990427851676941, + "num_tokens": 440325294.0, + "step": 12085 + }, + { + "epoch": 2.2443825441039924, + "grad_norm": 1.7083830833435059, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8881062269210815, + "num_tokens": 440355017.0, + "step": 12086 + }, + { + "epoch": 2.244568245125348, + "grad_norm": 1.4853652715682983, + "learning_rate": 1e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.8972381353378296, + "num_tokens": 440391202.0, + "step": 12087 + }, + { + "epoch": 2.244753946146704, + "grad_norm": 1.6512534618377686, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8820220828056335, + "num_tokens": 440427613.0, + "step": 12088 + }, + { + "epoch": 2.2449396471680596, + "grad_norm": 1.4895373582839966, + "learning_rate": 1e-06, + "loss": 0.283, + "mean_token_accuracy": 0.8963539600372314, + "num_tokens": 440470432.0, + "step": 12089 + }, + { + "epoch": 2.245125348189415, + "grad_norm": 1.6932523250579834, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8819593191146851, + "num_tokens": 440505141.0, + "step": 12090 + }, + { + "epoch": 2.2453110492107706, + "grad_norm": 1.669028639793396, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8862918615341187, + "num_tokens": 440539690.0, + "step": 12091 + }, + { + "epoch": 2.2454967502321264, + "grad_norm": 1.6118097305297852, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8926825523376465, + "num_tokens": 440575014.0, + "step": 12092 + }, + { + "epoch": 2.245682451253482, + "grad_norm": 1.731198787689209, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8878706693649292, + "num_tokens": 440612092.0, + "step": 12093 + }, + { + "epoch": 2.2458681522748374, + "grad_norm": 1.6685680150985718, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8892254829406738, + "num_tokens": 440647633.0, + "step": 12094 + }, + { + "epoch": 2.246053853296193, + "grad_norm": 1.6412718296051025, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8925589919090271, + "num_tokens": 440680675.0, + "step": 12095 + }, + { + "epoch": 2.246239554317549, + "grad_norm": 1.6713314056396484, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8816921710968018, + "num_tokens": 440716113.0, + "step": 12096 + }, + { + "epoch": 2.246425255338904, + "grad_norm": 1.5882797241210938, + "learning_rate": 1e-06, + "loss": 0.2729, + "mean_token_accuracy": 0.9008581042289734, + "num_tokens": 440749053.0, + "step": 12097 + }, + { + "epoch": 2.24661095636026, + "grad_norm": 1.6339216232299805, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8853302001953125, + "num_tokens": 440785766.0, + "step": 12098 + }, + { + "epoch": 2.2467966573816156, + "grad_norm": 1.7307628393173218, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.866613507270813, + "num_tokens": 440826010.0, + "step": 12099 + }, + { + "epoch": 2.2469823584029713, + "grad_norm": 1.6620668172836304, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8926619291305542, + "num_tokens": 440860632.0, + "step": 12100 + }, + { + "epoch": 2.247168059424327, + "grad_norm": 1.6795930862426758, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8844674229621887, + "num_tokens": 440895036.0, + "step": 12101 + }, + { + "epoch": 2.2473537604456824, + "grad_norm": 1.5307180881500244, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8729909062385559, + "num_tokens": 440941368.0, + "step": 12102 + }, + { + "epoch": 2.247539461467038, + "grad_norm": 1.6937212944030762, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8865270614624023, + "num_tokens": 440977714.0, + "step": 12103 + }, + { + "epoch": 2.247725162488394, + "grad_norm": 1.6065038442611694, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8807187676429749, + "num_tokens": 441016188.0, + "step": 12104 + }, + { + "epoch": 2.247910863509749, + "grad_norm": 1.6517763137817383, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8867624998092651, + "num_tokens": 441052648.0, + "step": 12105 + }, + { + "epoch": 2.248096564531105, + "grad_norm": 1.643633246421814, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8886725902557373, + "num_tokens": 441087612.0, + "step": 12106 + }, + { + "epoch": 2.2482822655524606, + "grad_norm": 1.4854121208190918, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8807844519615173, + "num_tokens": 441134610.0, + "step": 12107 + }, + { + "epoch": 2.2484679665738163, + "grad_norm": 1.6308726072311401, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8777061104774475, + "num_tokens": 441175830.0, + "step": 12108 + }, + { + "epoch": 2.2486536675951716, + "grad_norm": 1.655959129333496, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.892087459564209, + "num_tokens": 441211655.0, + "step": 12109 + }, + { + "epoch": 2.2488393686165273, + "grad_norm": 1.7524856328964233, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8861979246139526, + "num_tokens": 441244732.0, + "step": 12110 + }, + { + "epoch": 2.249025069637883, + "grad_norm": 1.6930999755859375, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8925161957740784, + "num_tokens": 441275370.0, + "step": 12111 + }, + { + "epoch": 2.249210770659239, + "grad_norm": 1.772546410560608, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8673087358474731, + "num_tokens": 441312131.0, + "step": 12112 + }, + { + "epoch": 2.249396471680594, + "grad_norm": 1.6056102514266968, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8966784477233887, + "num_tokens": 441347022.0, + "step": 12113 + }, + { + "epoch": 2.24958217270195, + "grad_norm": 1.6594685316085815, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8773345947265625, + "num_tokens": 441382796.0, + "step": 12114 + }, + { + "epoch": 2.2497678737233056, + "grad_norm": 1.5699936151504517, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8909082412719727, + "num_tokens": 441421793.0, + "step": 12115 + }, + { + "epoch": 2.2499535747446613, + "grad_norm": 1.6340688467025757, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.891251802444458, + "num_tokens": 441456661.0, + "step": 12116 + }, + { + "epoch": 2.2501392757660166, + "grad_norm": 1.4697082042694092, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.890083909034729, + "num_tokens": 441496950.0, + "step": 12117 + }, + { + "epoch": 2.2503249767873723, + "grad_norm": 1.8246043920516968, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8751590251922607, + "num_tokens": 441532144.0, + "step": 12118 + }, + { + "epoch": 2.250510677808728, + "grad_norm": 1.5933406352996826, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8941242694854736, + "num_tokens": 441568065.0, + "step": 12119 + }, + { + "epoch": 2.2506963788300833, + "grad_norm": 1.7446552515029907, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8851956129074097, + "num_tokens": 441600337.0, + "step": 12120 + }, + { + "epoch": 2.250882079851439, + "grad_norm": 1.6361092329025269, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8848400115966797, + "num_tokens": 441636814.0, + "step": 12121 + }, + { + "epoch": 2.251067780872795, + "grad_norm": 1.5591222047805786, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.886633574962616, + "num_tokens": 441675673.0, + "step": 12122 + }, + { + "epoch": 2.2512534818941505, + "grad_norm": 1.5738377571105957, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8857765197753906, + "num_tokens": 441713766.0, + "step": 12123 + }, + { + "epoch": 2.2514391829155063, + "grad_norm": 1.4986023902893066, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8866628408432007, + "num_tokens": 441757602.0, + "step": 12124 + }, + { + "epoch": 2.2516248839368616, + "grad_norm": 1.7098835706710815, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8727623224258423, + "num_tokens": 441792439.0, + "step": 12125 + }, + { + "epoch": 2.2518105849582173, + "grad_norm": 1.6146211624145508, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8787676095962524, + "num_tokens": 441830425.0, + "step": 12126 + }, + { + "epoch": 2.251996285979573, + "grad_norm": 1.6940691471099854, + "learning_rate": 1e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.898920476436615, + "num_tokens": 441861160.0, + "step": 12127 + }, + { + "epoch": 2.2521819870009283, + "grad_norm": 1.746222972869873, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8701868057250977, + "num_tokens": 441898698.0, + "step": 12128 + }, + { + "epoch": 2.252367688022284, + "grad_norm": 1.56437087059021, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8857353925704956, + "num_tokens": 441937982.0, + "step": 12129 + }, + { + "epoch": 2.2525533890436398, + "grad_norm": 1.5273412466049194, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8818337321281433, + "num_tokens": 441976127.0, + "step": 12130 + }, + { + "epoch": 2.2527390900649955, + "grad_norm": 1.5069119930267334, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8857041001319885, + "num_tokens": 442016092.0, + "step": 12131 + }, + { + "epoch": 2.252924791086351, + "grad_norm": 1.5506041049957275, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8754897713661194, + "num_tokens": 442058942.0, + "step": 12132 + }, + { + "epoch": 2.2531104921077065, + "grad_norm": 1.612128496170044, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8857111930847168, + "num_tokens": 442096970.0, + "step": 12133 + }, + { + "epoch": 2.2532961931290623, + "grad_norm": 1.659293293952942, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8795027732849121, + "num_tokens": 442131451.0, + "step": 12134 + }, + { + "epoch": 2.253481894150418, + "grad_norm": 1.5992047786712646, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8713375329971313, + "num_tokens": 442173887.0, + "step": 12135 + }, + { + "epoch": 2.2536675951717733, + "grad_norm": 1.5306859016418457, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.881587028503418, + "num_tokens": 442212557.0, + "step": 12136 + }, + { + "epoch": 2.253853296193129, + "grad_norm": 1.5485758781433105, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.8945510387420654, + "num_tokens": 442249848.0, + "step": 12137 + }, + { + "epoch": 2.2540389972144848, + "grad_norm": 1.9112634658813477, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8791524171829224, + "num_tokens": 442279520.0, + "step": 12138 + }, + { + "epoch": 2.2542246982358405, + "grad_norm": 1.5950472354888916, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8845507502555847, + "num_tokens": 442316855.0, + "step": 12139 + }, + { + "epoch": 2.2544103992571958, + "grad_norm": 1.6582448482513428, + "learning_rate": 1e-06, + "loss": 0.2809, + "mean_token_accuracy": 0.8984604477882385, + "num_tokens": 442351950.0, + "step": 12140 + }, + { + "epoch": 2.2545961002785515, + "grad_norm": 1.5640496015548706, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8815911412239075, + "num_tokens": 442390052.0, + "step": 12141 + }, + { + "epoch": 2.2547818012999072, + "grad_norm": 1.5178184509277344, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8853395581245422, + "num_tokens": 442429398.0, + "step": 12142 + }, + { + "epoch": 2.2549675023212625, + "grad_norm": 1.5847833156585693, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8853356838226318, + "num_tokens": 442467455.0, + "step": 12143 + }, + { + "epoch": 2.2551532033426183, + "grad_norm": 1.7534607648849487, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8872516751289368, + "num_tokens": 442499438.0, + "step": 12144 + }, + { + "epoch": 2.255338904363974, + "grad_norm": 1.7192673683166504, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.885097861289978, + "num_tokens": 442535581.0, + "step": 12145 + }, + { + "epoch": 2.2555246053853297, + "grad_norm": 1.5235689878463745, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8845407962799072, + "num_tokens": 442573767.0, + "step": 12146 + }, + { + "epoch": 2.2557103064066855, + "grad_norm": 1.8290191888809204, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.881644606590271, + "num_tokens": 442605482.0, + "step": 12147 + }, + { + "epoch": 2.2558960074280408, + "grad_norm": 1.5978546142578125, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8948381543159485, + "num_tokens": 442643941.0, + "step": 12148 + }, + { + "epoch": 2.2560817084493965, + "grad_norm": 1.636311650276184, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8899217844009399, + "num_tokens": 442677641.0, + "step": 12149 + }, + { + "epoch": 2.256267409470752, + "grad_norm": 1.7128621339797974, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8893805146217346, + "num_tokens": 442711898.0, + "step": 12150 + }, + { + "epoch": 2.2564531104921075, + "grad_norm": 1.6670140027999878, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8942646980285645, + "num_tokens": 442746358.0, + "step": 12151 + }, + { + "epoch": 2.2566388115134632, + "grad_norm": 1.618577480316162, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.887345552444458, + "num_tokens": 442784982.0, + "step": 12152 + }, + { + "epoch": 2.256824512534819, + "grad_norm": 1.6579997539520264, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8709177374839783, + "num_tokens": 442825969.0, + "step": 12153 + }, + { + "epoch": 2.2570102135561747, + "grad_norm": 1.797455072402954, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8890132904052734, + "num_tokens": 442857657.0, + "step": 12154 + }, + { + "epoch": 2.25719591457753, + "grad_norm": 1.6104933023452759, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8927857875823975, + "num_tokens": 442890375.0, + "step": 12155 + }, + { + "epoch": 2.2573816155988857, + "grad_norm": 1.7400456666946411, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8837811946868896, + "num_tokens": 442927013.0, + "step": 12156 + }, + { + "epoch": 2.2575673166202415, + "grad_norm": 1.4951967000961304, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8846691846847534, + "num_tokens": 442966703.0, + "step": 12157 + }, + { + "epoch": 2.257753017641597, + "grad_norm": 1.5894088745117188, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8882101774215698, + "num_tokens": 443004383.0, + "step": 12158 + }, + { + "epoch": 2.2579387186629525, + "grad_norm": 1.6946617364883423, + "learning_rate": 1e-06, + "loss": 0.267, + "mean_token_accuracy": 0.903643786907196, + "num_tokens": 443033980.0, + "step": 12159 + }, + { + "epoch": 2.258124419684308, + "grad_norm": 1.6122510433197021, + "learning_rate": 1e-06, + "loss": 0.2695, + "mean_token_accuracy": 0.9017478227615356, + "num_tokens": 443067397.0, + "step": 12160 + }, + { + "epoch": 2.258310120705664, + "grad_norm": 1.7153598070144653, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.879281222820282, + "num_tokens": 443103505.0, + "step": 12161 + }, + { + "epoch": 2.2584958217270197, + "grad_norm": 1.5815190076828003, + "learning_rate": 1e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.8949681520462036, + "num_tokens": 443139662.0, + "step": 12162 + }, + { + "epoch": 2.258681522748375, + "grad_norm": 1.6512367725372314, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8776710033416748, + "num_tokens": 443175494.0, + "step": 12163 + }, + { + "epoch": 2.2588672237697307, + "grad_norm": 1.5516551733016968, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8784786462783813, + "num_tokens": 443216959.0, + "step": 12164 + }, + { + "epoch": 2.2590529247910864, + "grad_norm": 1.6286125183105469, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8856953382492065, + "num_tokens": 443251712.0, + "step": 12165 + }, + { + "epoch": 2.2592386258124417, + "grad_norm": 1.6499152183532715, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8811064958572388, + "num_tokens": 443288019.0, + "step": 12166 + }, + { + "epoch": 2.2594243268337975, + "grad_norm": 1.6051017045974731, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8889294862747192, + "num_tokens": 443322648.0, + "step": 12167 + }, + { + "epoch": 2.259610027855153, + "grad_norm": 1.5490028858184814, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8840417265892029, + "num_tokens": 443363385.0, + "step": 12168 + }, + { + "epoch": 2.259795728876509, + "grad_norm": 1.567307472229004, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8869941234588623, + "num_tokens": 443400265.0, + "step": 12169 + }, + { + "epoch": 2.2599814298978647, + "grad_norm": 1.6232140064239502, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8895079493522644, + "num_tokens": 443438131.0, + "step": 12170 + }, + { + "epoch": 2.26016713091922, + "grad_norm": 1.721432089805603, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8733785152435303, + "num_tokens": 443472638.0, + "step": 12171 + }, + { + "epoch": 2.2603528319405757, + "grad_norm": 1.633495569229126, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8902738690376282, + "num_tokens": 443507543.0, + "step": 12172 + }, + { + "epoch": 2.2605385329619314, + "grad_norm": 1.460242509841919, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8919602632522583, + "num_tokens": 443547116.0, + "step": 12173 + }, + { + "epoch": 2.2607242339832867, + "grad_norm": 1.5496177673339844, + "learning_rate": 1e-06, + "loss": 0.2867, + "mean_token_accuracy": 0.8960186243057251, + "num_tokens": 443582779.0, + "step": 12174 + }, + { + "epoch": 2.2609099350046424, + "grad_norm": 1.6105880737304688, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8828598260879517, + "num_tokens": 443620843.0, + "step": 12175 + }, + { + "epoch": 2.261095636025998, + "grad_norm": 1.5755342245101929, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8893663883209229, + "num_tokens": 443660022.0, + "step": 12176 + }, + { + "epoch": 2.261281337047354, + "grad_norm": 1.632311463356018, + "learning_rate": 1e-06, + "loss": 0.2756, + "mean_token_accuracy": 0.8978694677352905, + "num_tokens": 443692127.0, + "step": 12177 + }, + { + "epoch": 2.261467038068709, + "grad_norm": 1.5136055946350098, + "learning_rate": 1e-06, + "loss": 0.2771, + "mean_token_accuracy": 0.8991808891296387, + "num_tokens": 443732308.0, + "step": 12178 + }, + { + "epoch": 2.261652739090065, + "grad_norm": 1.7388885021209717, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8854153156280518, + "num_tokens": 443764577.0, + "step": 12179 + }, + { + "epoch": 2.2618384401114207, + "grad_norm": 1.593487024307251, + "learning_rate": 1e-06, + "loss": 0.2756, + "mean_token_accuracy": 0.8961141109466553, + "num_tokens": 443798406.0, + "step": 12180 + }, + { + "epoch": 2.2620241411327764, + "grad_norm": 1.5357468128204346, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8850102424621582, + "num_tokens": 443837132.0, + "step": 12181 + }, + { + "epoch": 2.2622098421541317, + "grad_norm": 1.5168077945709229, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.882117748260498, + "num_tokens": 443877380.0, + "step": 12182 + }, + { + "epoch": 2.2623955431754874, + "grad_norm": 1.8475443124771118, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8790793418884277, + "num_tokens": 443908460.0, + "step": 12183 + }, + { + "epoch": 2.262581244196843, + "grad_norm": 1.5625386238098145, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8847760558128357, + "num_tokens": 443948831.0, + "step": 12184 + }, + { + "epoch": 2.262766945218199, + "grad_norm": 1.7181309461593628, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8914715647697449, + "num_tokens": 443979958.0, + "step": 12185 + }, + { + "epoch": 2.262952646239554, + "grad_norm": 1.6197189092636108, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8940306901931763, + "num_tokens": 444012728.0, + "step": 12186 + }, + { + "epoch": 2.26313834726091, + "grad_norm": 1.630598783493042, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8777846693992615, + "num_tokens": 444049748.0, + "step": 12187 + }, + { + "epoch": 2.2633240482822656, + "grad_norm": 1.7942856550216675, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8721544742584229, + "num_tokens": 444086710.0, + "step": 12188 + }, + { + "epoch": 2.263509749303621, + "grad_norm": 1.6500859260559082, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8940788507461548, + "num_tokens": 444121738.0, + "step": 12189 + }, + { + "epoch": 2.2636954503249767, + "grad_norm": 1.6015344858169556, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8941130638122559, + "num_tokens": 444158867.0, + "step": 12190 + }, + { + "epoch": 2.2638811513463324, + "grad_norm": 1.7471140623092651, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8755882978439331, + "num_tokens": 444190463.0, + "step": 12191 + }, + { + "epoch": 2.264066852367688, + "grad_norm": 1.7082048654556274, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8772674798965454, + "num_tokens": 444227270.0, + "step": 12192 + }, + { + "epoch": 2.264252553389044, + "grad_norm": 1.5087705850601196, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8932520151138306, + "num_tokens": 444264589.0, + "step": 12193 + }, + { + "epoch": 2.264438254410399, + "grad_norm": 1.6566215753555298, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8972254991531372, + "num_tokens": 444297629.0, + "step": 12194 + }, + { + "epoch": 2.264623955431755, + "grad_norm": 1.5934944152832031, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8836214542388916, + "num_tokens": 444335286.0, + "step": 12195 + }, + { + "epoch": 2.2648096564531106, + "grad_norm": 1.5297341346740723, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8864912986755371, + "num_tokens": 444376967.0, + "step": 12196 + }, + { + "epoch": 2.264995357474466, + "grad_norm": 1.5975322723388672, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.881942868232727, + "num_tokens": 444413962.0, + "step": 12197 + }, + { + "epoch": 2.2651810584958216, + "grad_norm": 1.5105130672454834, + "learning_rate": 1e-06, + "loss": 0.2752, + "mean_token_accuracy": 0.904822826385498, + "num_tokens": 444454697.0, + "step": 12198 + }, + { + "epoch": 2.2653667595171774, + "grad_norm": 1.5439453125, + "learning_rate": 1e-06, + "loss": 0.2774, + "mean_token_accuracy": 0.9018486738204956, + "num_tokens": 444487947.0, + "step": 12199 + }, + { + "epoch": 2.265552460538533, + "grad_norm": 1.6476861238479614, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.883934497833252, + "num_tokens": 444522697.0, + "step": 12200 + }, + { + "epoch": 2.2657381615598884, + "grad_norm": 1.7189494371414185, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8878897428512573, + "num_tokens": 444554467.0, + "step": 12201 + }, + { + "epoch": 2.265923862581244, + "grad_norm": 1.5635850429534912, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8971823453903198, + "num_tokens": 444592983.0, + "step": 12202 + }, + { + "epoch": 2.2661095636026, + "grad_norm": 1.7030909061431885, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8750842809677124, + "num_tokens": 444627566.0, + "step": 12203 + }, + { + "epoch": 2.2662952646239556, + "grad_norm": 1.5606417655944824, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8913741111755371, + "num_tokens": 444665375.0, + "step": 12204 + }, + { + "epoch": 2.266480965645311, + "grad_norm": 1.5823791027069092, + "learning_rate": 1e-06, + "loss": 0.2597, + "mean_token_accuracy": 0.9036544561386108, + "num_tokens": 444699279.0, + "step": 12205 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 1.6294399499893188, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.89702969789505, + "num_tokens": 444732078.0, + "step": 12206 + }, + { + "epoch": 2.2668523676880223, + "grad_norm": 1.631932020187378, + "learning_rate": 1e-06, + "loss": 0.269, + "mean_token_accuracy": 0.9010368585586548, + "num_tokens": 444763775.0, + "step": 12207 + }, + { + "epoch": 2.267038068709378, + "grad_norm": 1.4384797811508179, + "learning_rate": 1e-06, + "loss": 0.2819, + "mean_token_accuracy": 0.8951954245567322, + "num_tokens": 444803159.0, + "step": 12208 + }, + { + "epoch": 2.2672237697307334, + "grad_norm": 1.6060335636138916, + "learning_rate": 1e-06, + "loss": 0.2767, + "mean_token_accuracy": 0.8955117464065552, + "num_tokens": 444838369.0, + "step": 12209 + }, + { + "epoch": 2.267409470752089, + "grad_norm": 1.6280583143234253, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8842586278915405, + "num_tokens": 444874015.0, + "step": 12210 + }, + { + "epoch": 2.267595171773445, + "grad_norm": 1.6779955625534058, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8713890314102173, + "num_tokens": 444912772.0, + "step": 12211 + }, + { + "epoch": 2.2677808727948006, + "grad_norm": 1.6112266778945923, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8706010580062866, + "num_tokens": 444951748.0, + "step": 12212 + }, + { + "epoch": 2.267966573816156, + "grad_norm": 1.5543675422668457, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.8972881436347961, + "num_tokens": 444988680.0, + "step": 12213 + }, + { + "epoch": 2.2681522748375116, + "grad_norm": 1.575777530670166, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8865476250648499, + "num_tokens": 445027866.0, + "step": 12214 + }, + { + "epoch": 2.2683379758588673, + "grad_norm": 1.646848201751709, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8754799365997314, + "num_tokens": 445062887.0, + "step": 12215 + }, + { + "epoch": 2.268523676880223, + "grad_norm": 1.5132696628570557, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8948651552200317, + "num_tokens": 445101722.0, + "step": 12216 + }, + { + "epoch": 2.2687093779015783, + "grad_norm": 1.75722336769104, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8962218165397644, + "num_tokens": 445130325.0, + "step": 12217 + }, + { + "epoch": 2.268895078922934, + "grad_norm": 1.6253012418746948, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8785885572433472, + "num_tokens": 445165861.0, + "step": 12218 + }, + { + "epoch": 2.26908077994429, + "grad_norm": 1.5173598527908325, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.889045000076294, + "num_tokens": 445207261.0, + "step": 12219 + }, + { + "epoch": 2.269266480965645, + "grad_norm": 1.7345579862594604, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.8990951180458069, + "num_tokens": 445237640.0, + "step": 12220 + }, + { + "epoch": 2.269452181987001, + "grad_norm": 1.6777503490447998, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8874887228012085, + "num_tokens": 445274082.0, + "step": 12221 + }, + { + "epoch": 2.2696378830083566, + "grad_norm": 1.5831782817840576, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8897483348846436, + "num_tokens": 445309841.0, + "step": 12222 + }, + { + "epoch": 2.2698235840297123, + "grad_norm": 1.6595537662506104, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8788532614707947, + "num_tokens": 445346353.0, + "step": 12223 + }, + { + "epoch": 2.270009285051068, + "grad_norm": 1.5927413702011108, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8843166828155518, + "num_tokens": 445384158.0, + "step": 12224 + }, + { + "epoch": 2.2701949860724233, + "grad_norm": 1.723556637763977, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8746612668037415, + "num_tokens": 445418729.0, + "step": 12225 + }, + { + "epoch": 2.270380687093779, + "grad_norm": 1.7412010431289673, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8844664692878723, + "num_tokens": 445448728.0, + "step": 12226 + }, + { + "epoch": 2.270566388115135, + "grad_norm": 1.4915388822555542, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8922603726387024, + "num_tokens": 445489400.0, + "step": 12227 + }, + { + "epoch": 2.27075208913649, + "grad_norm": 1.6880370378494263, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8691884279251099, + "num_tokens": 445526944.0, + "step": 12228 + }, + { + "epoch": 2.270937790157846, + "grad_norm": 1.5984643697738647, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8872168064117432, + "num_tokens": 445562831.0, + "step": 12229 + }, + { + "epoch": 2.2711234911792015, + "grad_norm": 1.9030063152313232, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8647618293762207, + "num_tokens": 445597128.0, + "step": 12230 + }, + { + "epoch": 2.2713091922005573, + "grad_norm": 1.84322190284729, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8777632713317871, + "num_tokens": 445634762.0, + "step": 12231 + }, + { + "epoch": 2.2714948932219126, + "grad_norm": 1.5857274532318115, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8883963227272034, + "num_tokens": 445672116.0, + "step": 12232 + }, + { + "epoch": 2.2716805942432683, + "grad_norm": 1.5141489505767822, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8958078026771545, + "num_tokens": 445711489.0, + "step": 12233 + }, + { + "epoch": 2.271866295264624, + "grad_norm": 1.5288639068603516, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8920905590057373, + "num_tokens": 445749815.0, + "step": 12234 + }, + { + "epoch": 2.2720519962859798, + "grad_norm": 1.5203081369400024, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.893810510635376, + "num_tokens": 445787901.0, + "step": 12235 + }, + { + "epoch": 2.272237697307335, + "grad_norm": 1.3998361825942993, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8884817361831665, + "num_tokens": 445832643.0, + "step": 12236 + }, + { + "epoch": 2.272423398328691, + "grad_norm": 1.4646166563034058, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8933290243148804, + "num_tokens": 445874187.0, + "step": 12237 + }, + { + "epoch": 2.2726090993500465, + "grad_norm": 1.5431113243103027, + "learning_rate": 1e-06, + "loss": 0.2719, + "mean_token_accuracy": 0.9045425653457642, + "num_tokens": 445909120.0, + "step": 12238 + }, + { + "epoch": 2.2727948003714022, + "grad_norm": 1.5904191732406616, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8798688650131226, + "num_tokens": 445948066.0, + "step": 12239 + }, + { + "epoch": 2.2729805013927575, + "grad_norm": 1.614549160003662, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8818004131317139, + "num_tokens": 445982404.0, + "step": 12240 + }, + { + "epoch": 2.2731662024141133, + "grad_norm": 1.5754863023757935, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.881304919719696, + "num_tokens": 446022969.0, + "step": 12241 + }, + { + "epoch": 2.273351903435469, + "grad_norm": 1.6409151554107666, + "learning_rate": 1e-06, + "loss": 0.277, + "mean_token_accuracy": 0.899998664855957, + "num_tokens": 446054562.0, + "step": 12242 + }, + { + "epoch": 2.2735376044568243, + "grad_norm": 1.682515025138855, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8732095956802368, + "num_tokens": 446089928.0, + "step": 12243 + }, + { + "epoch": 2.27372330547818, + "grad_norm": 1.6994754076004028, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8868362307548523, + "num_tokens": 446122394.0, + "step": 12244 + }, + { + "epoch": 2.2739090064995358, + "grad_norm": 1.6260803937911987, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8831385970115662, + "num_tokens": 446160197.0, + "step": 12245 + }, + { + "epoch": 2.2740947075208915, + "grad_norm": 1.5346176624298096, + "learning_rate": 1e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.897314190864563, + "num_tokens": 446198959.0, + "step": 12246 + }, + { + "epoch": 2.2742804085422472, + "grad_norm": 1.5843981504440308, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8952833414077759, + "num_tokens": 446236478.0, + "step": 12247 + }, + { + "epoch": 2.2744661095636025, + "grad_norm": 1.5551302433013916, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8835564851760864, + "num_tokens": 446276834.0, + "step": 12248 + }, + { + "epoch": 2.2746518105849582, + "grad_norm": 1.739469289779663, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8862789869308472, + "num_tokens": 446311600.0, + "step": 12249 + }, + { + "epoch": 2.274837511606314, + "grad_norm": 1.6419901847839355, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8817942142486572, + "num_tokens": 446348318.0, + "step": 12250 + }, + { + "epoch": 2.2750232126276693, + "grad_norm": 1.625144362449646, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8805237412452698, + "num_tokens": 446387464.0, + "step": 12251 + }, + { + "epoch": 2.275208913649025, + "grad_norm": 1.572208285331726, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8853603005409241, + "num_tokens": 446424351.0, + "step": 12252 + }, + { + "epoch": 2.2753946146703807, + "grad_norm": 1.7677990198135376, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8946824669837952, + "num_tokens": 446453854.0, + "step": 12253 + }, + { + "epoch": 2.2755803156917365, + "grad_norm": 1.6854054927825928, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.874678909778595, + "num_tokens": 446493798.0, + "step": 12254 + }, + { + "epoch": 2.2757660167130918, + "grad_norm": 1.7055423259735107, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8820664286613464, + "num_tokens": 446527302.0, + "step": 12255 + }, + { + "epoch": 2.2759517177344475, + "grad_norm": 1.5419777631759644, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8800176382064819, + "num_tokens": 446565265.0, + "step": 12256 + }, + { + "epoch": 2.276137418755803, + "grad_norm": 1.671176791191101, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8835270404815674, + "num_tokens": 446598940.0, + "step": 12257 + }, + { + "epoch": 2.276323119777159, + "grad_norm": 1.5968197584152222, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8882884979248047, + "num_tokens": 446634336.0, + "step": 12258 + }, + { + "epoch": 2.2765088207985142, + "grad_norm": 1.593827486038208, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8856008052825928, + "num_tokens": 446671871.0, + "step": 12259 + }, + { + "epoch": 2.27669452181987, + "grad_norm": 1.6138895750045776, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8874694108963013, + "num_tokens": 446708212.0, + "step": 12260 + }, + { + "epoch": 2.2768802228412257, + "grad_norm": 1.567007303237915, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8806368112564087, + "num_tokens": 446745634.0, + "step": 12261 + }, + { + "epoch": 2.2770659238625814, + "grad_norm": 1.6417909860610962, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8879369497299194, + "num_tokens": 446778879.0, + "step": 12262 + }, + { + "epoch": 2.2772516248839367, + "grad_norm": 1.5644315481185913, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8915524482727051, + "num_tokens": 446818380.0, + "step": 12263 + }, + { + "epoch": 2.2774373259052925, + "grad_norm": 1.597964882850647, + "learning_rate": 1e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.899686872959137, + "num_tokens": 446850898.0, + "step": 12264 + }, + { + "epoch": 2.277623026926648, + "grad_norm": 1.7240079641342163, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8659698367118835, + "num_tokens": 446887521.0, + "step": 12265 + }, + { + "epoch": 2.2778087279480035, + "grad_norm": 1.5363117456436157, + "learning_rate": 1e-06, + "loss": 0.2711, + "mean_token_accuracy": 0.9010680913925171, + "num_tokens": 446921314.0, + "step": 12266 + }, + { + "epoch": 2.277994428969359, + "grad_norm": 1.6288948059082031, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8958597183227539, + "num_tokens": 446954959.0, + "step": 12267 + }, + { + "epoch": 2.278180129990715, + "grad_norm": 1.4985110759735107, + "learning_rate": 1e-06, + "loss": 0.2607, + "mean_token_accuracy": 0.9032965302467346, + "num_tokens": 446992567.0, + "step": 12268 + }, + { + "epoch": 2.2783658310120707, + "grad_norm": 1.6143815517425537, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8768376111984253, + "num_tokens": 447032992.0, + "step": 12269 + }, + { + "epoch": 2.2785515320334264, + "grad_norm": 1.7526209354400635, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8725868463516235, + "num_tokens": 447066532.0, + "step": 12270 + }, + { + "epoch": 2.2787372330547817, + "grad_norm": 1.5533581972122192, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.880353569984436, + "num_tokens": 447104845.0, + "step": 12271 + }, + { + "epoch": 2.2789229340761374, + "grad_norm": 1.6405396461486816, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8784245848655701, + "num_tokens": 447141662.0, + "step": 12272 + }, + { + "epoch": 2.279108635097493, + "grad_norm": 1.7279378175735474, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8931334614753723, + "num_tokens": 447175097.0, + "step": 12273 + }, + { + "epoch": 2.2792943361188485, + "grad_norm": 1.6516258716583252, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8832562565803528, + "num_tokens": 447211415.0, + "step": 12274 + }, + { + "epoch": 2.279480037140204, + "grad_norm": 1.5638755559921265, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8897807598114014, + "num_tokens": 447248173.0, + "step": 12275 + }, + { + "epoch": 2.27966573816156, + "grad_norm": 1.6669301986694336, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8830959796905518, + "num_tokens": 447282292.0, + "step": 12276 + }, + { + "epoch": 2.2798514391829157, + "grad_norm": 1.7469342947006226, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.88218092918396, + "num_tokens": 447313046.0, + "step": 12277 + }, + { + "epoch": 2.280037140204271, + "grad_norm": 1.5249507427215576, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8892803192138672, + "num_tokens": 447352145.0, + "step": 12278 + }, + { + "epoch": 2.2802228412256267, + "grad_norm": 1.6190823316574097, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.884921669960022, + "num_tokens": 447388389.0, + "step": 12279 + }, + { + "epoch": 2.2804085422469824, + "grad_norm": 1.7252110242843628, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8901442885398865, + "num_tokens": 447421446.0, + "step": 12280 + }, + { + "epoch": 2.280594243268338, + "grad_norm": 1.6341029405593872, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8776040077209473, + "num_tokens": 447458932.0, + "step": 12281 + }, + { + "epoch": 2.2807799442896934, + "grad_norm": 1.6713272333145142, + "learning_rate": 1e-06, + "loss": 0.2679, + "mean_token_accuracy": 0.8999195098876953, + "num_tokens": 447494168.0, + "step": 12282 + }, + { + "epoch": 2.280965645311049, + "grad_norm": 1.528979778289795, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8916412591934204, + "num_tokens": 447530656.0, + "step": 12283 + }, + { + "epoch": 2.281151346332405, + "grad_norm": 1.6619632244110107, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8909235000610352, + "num_tokens": 447565522.0, + "step": 12284 + }, + { + "epoch": 2.2813370473537606, + "grad_norm": 1.6272151470184326, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8777302503585815, + "num_tokens": 447605244.0, + "step": 12285 + }, + { + "epoch": 2.281522748375116, + "grad_norm": 1.5064595937728882, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8916701078414917, + "num_tokens": 447646489.0, + "step": 12286 + }, + { + "epoch": 2.2817084493964717, + "grad_norm": 1.7160488367080688, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8803632259368896, + "num_tokens": 447682217.0, + "step": 12287 + }, + { + "epoch": 2.2818941504178274, + "grad_norm": 1.5731419324874878, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8874882459640503, + "num_tokens": 447718182.0, + "step": 12288 + }, + { + "epoch": 2.2820798514391827, + "grad_norm": 1.7176858186721802, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8880178928375244, + "num_tokens": 447752342.0, + "step": 12289 + }, + { + "epoch": 2.2822655524605384, + "grad_norm": 1.7012373208999634, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8814916610717773, + "num_tokens": 447784283.0, + "step": 12290 + }, + { + "epoch": 2.282451253481894, + "grad_norm": 1.764402151107788, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8753281831741333, + "num_tokens": 447818952.0, + "step": 12291 + }, + { + "epoch": 2.28263695450325, + "grad_norm": 1.7946934700012207, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.890343427658081, + "num_tokens": 447847032.0, + "step": 12292 + }, + { + "epoch": 2.2828226555246056, + "grad_norm": 1.776711344718933, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8930938243865967, + "num_tokens": 447878746.0, + "step": 12293 + }, + { + "epoch": 2.283008356545961, + "grad_norm": 1.7748417854309082, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8855844140052795, + "num_tokens": 447913584.0, + "step": 12294 + }, + { + "epoch": 2.2831940575673166, + "grad_norm": 1.873292326927185, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8808495402336121, + "num_tokens": 447945923.0, + "step": 12295 + }, + { + "epoch": 2.2833797585886724, + "grad_norm": 1.7797356843948364, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.891394853591919, + "num_tokens": 447974005.0, + "step": 12296 + }, + { + "epoch": 2.2835654596100277, + "grad_norm": 1.6202597618103027, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.890882134437561, + "num_tokens": 448010549.0, + "step": 12297 + }, + { + "epoch": 2.2837511606313834, + "grad_norm": 1.6000564098358154, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8790349960327148, + "num_tokens": 448051936.0, + "step": 12298 + }, + { + "epoch": 2.283936861652739, + "grad_norm": 1.7873972654342651, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8687677979469299, + "num_tokens": 448088509.0, + "step": 12299 + }, + { + "epoch": 2.284122562674095, + "grad_norm": 1.5791561603546143, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8907046914100647, + "num_tokens": 448126502.0, + "step": 12300 + }, + { + "epoch": 2.28430826369545, + "grad_norm": 1.7591181993484497, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8864773511886597, + "num_tokens": 448158135.0, + "step": 12301 + }, + { + "epoch": 2.284493964716806, + "grad_norm": 1.5058541297912598, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8871679306030273, + "num_tokens": 448198661.0, + "step": 12302 + }, + { + "epoch": 2.2846796657381616, + "grad_norm": 1.4779618978500366, + "learning_rate": 1e-06, + "loss": 0.2792, + "mean_token_accuracy": 0.8972759246826172, + "num_tokens": 448235662.0, + "step": 12303 + }, + { + "epoch": 2.2848653667595173, + "grad_norm": 1.7632893323898315, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8911841511726379, + "num_tokens": 448266243.0, + "step": 12304 + }, + { + "epoch": 2.2850510677808726, + "grad_norm": 1.491428017616272, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8916250467300415, + "num_tokens": 448312553.0, + "step": 12305 + }, + { + "epoch": 2.2852367688022284, + "grad_norm": 1.476941704750061, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8929206132888794, + "num_tokens": 448351884.0, + "step": 12306 + }, + { + "epoch": 2.285422469823584, + "grad_norm": 1.5687369108200073, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8873388767242432, + "num_tokens": 448389813.0, + "step": 12307 + }, + { + "epoch": 2.28560817084494, + "grad_norm": 1.6377640962600708, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8822524547576904, + "num_tokens": 448428242.0, + "step": 12308 + }, + { + "epoch": 2.285793871866295, + "grad_norm": 1.6771879196166992, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8833177089691162, + "num_tokens": 448462218.0, + "step": 12309 + }, + { + "epoch": 2.285979572887651, + "grad_norm": 1.685900330543518, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8874151706695557, + "num_tokens": 448496388.0, + "step": 12310 + }, + { + "epoch": 2.2861652739090066, + "grad_norm": 1.82098388671875, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8781448602676392, + "num_tokens": 448525003.0, + "step": 12311 + }, + { + "epoch": 2.286350974930362, + "grad_norm": 1.4684562683105469, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8913401365280151, + "num_tokens": 448562686.0, + "step": 12312 + }, + { + "epoch": 2.2865366759517176, + "grad_norm": 1.6658581495285034, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.881049394607544, + "num_tokens": 448600617.0, + "step": 12313 + }, + { + "epoch": 2.2867223769730733, + "grad_norm": 1.6196402311325073, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8778577446937561, + "num_tokens": 448638841.0, + "step": 12314 + }, + { + "epoch": 2.286908077994429, + "grad_norm": 1.5815588235855103, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8819394111633301, + "num_tokens": 448677961.0, + "step": 12315 + }, + { + "epoch": 2.287093779015785, + "grad_norm": 1.4571020603179932, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8947290182113647, + "num_tokens": 448719524.0, + "step": 12316 + }, + { + "epoch": 2.28727948003714, + "grad_norm": 1.5551953315734863, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8893381953239441, + "num_tokens": 448758197.0, + "step": 12317 + }, + { + "epoch": 2.287465181058496, + "grad_norm": 1.4755040407180786, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8870675563812256, + "num_tokens": 448799440.0, + "step": 12318 + }, + { + "epoch": 2.2876508820798516, + "grad_norm": 1.820127248764038, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.887871265411377, + "num_tokens": 448828827.0, + "step": 12319 + }, + { + "epoch": 2.287836583101207, + "grad_norm": 1.7223491668701172, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8822237253189087, + "num_tokens": 448863257.0, + "step": 12320 + }, + { + "epoch": 2.2880222841225626, + "grad_norm": 1.6540358066558838, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8796539306640625, + "num_tokens": 448900784.0, + "step": 12321 + }, + { + "epoch": 2.2882079851439183, + "grad_norm": 1.7572826147079468, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8870916366577148, + "num_tokens": 448931879.0, + "step": 12322 + }, + { + "epoch": 2.288393686165274, + "grad_norm": 1.5540920495986938, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8910577893257141, + "num_tokens": 448967002.0, + "step": 12323 + }, + { + "epoch": 2.2885793871866293, + "grad_norm": 1.646852970123291, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8701188564300537, + "num_tokens": 449004362.0, + "step": 12324 + }, + { + "epoch": 2.288765088207985, + "grad_norm": 1.5207542181015015, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8884040117263794, + "num_tokens": 449043794.0, + "step": 12325 + }, + { + "epoch": 2.288950789229341, + "grad_norm": 1.6736823320388794, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8839176297187805, + "num_tokens": 449078592.0, + "step": 12326 + }, + { + "epoch": 2.2891364902506965, + "grad_norm": 1.6983835697174072, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.8952268362045288, + "num_tokens": 449109347.0, + "step": 12327 + }, + { + "epoch": 2.289322191272052, + "grad_norm": 1.439510703086853, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.9005485773086548, + "num_tokens": 449148963.0, + "step": 12328 + }, + { + "epoch": 2.2895078922934076, + "grad_norm": 1.6903753280639648, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8839801549911499, + "num_tokens": 449187658.0, + "step": 12329 + }, + { + "epoch": 2.2896935933147633, + "grad_norm": 1.5036481618881226, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8881709575653076, + "num_tokens": 449228598.0, + "step": 12330 + }, + { + "epoch": 2.289879294336119, + "grad_norm": 1.6847599744796753, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8835498690605164, + "num_tokens": 449262996.0, + "step": 12331 + }, + { + "epoch": 2.2900649953574743, + "grad_norm": 1.6418622732162476, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8944231271743774, + "num_tokens": 449297120.0, + "step": 12332 + }, + { + "epoch": 2.29025069637883, + "grad_norm": 1.562039852142334, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8939681053161621, + "num_tokens": 449332750.0, + "step": 12333 + }, + { + "epoch": 2.290436397400186, + "grad_norm": 1.5498155355453491, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8880543112754822, + "num_tokens": 449371376.0, + "step": 12334 + }, + { + "epoch": 2.290622098421541, + "grad_norm": 1.6933941841125488, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8902056217193604, + "num_tokens": 449404678.0, + "step": 12335 + }, + { + "epoch": 2.290807799442897, + "grad_norm": 1.6838295459747314, + "learning_rate": 1e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.8965809345245361, + "num_tokens": 449434503.0, + "step": 12336 + }, + { + "epoch": 2.2909935004642525, + "grad_norm": 1.5371893644332886, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8865053653717041, + "num_tokens": 449474118.0, + "step": 12337 + }, + { + "epoch": 2.2911792014856083, + "grad_norm": 1.5450997352600098, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.879751443862915, + "num_tokens": 449513387.0, + "step": 12338 + }, + { + "epoch": 2.291364902506964, + "grad_norm": 1.5149152278900146, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8854317665100098, + "num_tokens": 449555073.0, + "step": 12339 + }, + { + "epoch": 2.2915506035283193, + "grad_norm": 1.4038114547729492, + "learning_rate": 1e-06, + "loss": 0.2786, + "mean_token_accuracy": 0.8986844420433044, + "num_tokens": 449598072.0, + "step": 12340 + }, + { + "epoch": 2.291736304549675, + "grad_norm": 1.589025616645813, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8971265554428101, + "num_tokens": 449631242.0, + "step": 12341 + }, + { + "epoch": 2.2919220055710308, + "grad_norm": 1.7201396226882935, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8888826370239258, + "num_tokens": 449664002.0, + "step": 12342 + }, + { + "epoch": 2.292107706592386, + "grad_norm": 1.6087411642074585, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8825496435165405, + "num_tokens": 449703748.0, + "step": 12343 + }, + { + "epoch": 2.292293407613742, + "grad_norm": 1.5914239883422852, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8725082874298096, + "num_tokens": 449746683.0, + "step": 12344 + }, + { + "epoch": 2.2924791086350975, + "grad_norm": 1.6989057064056396, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8858076930046082, + "num_tokens": 449779659.0, + "step": 12345 + }, + { + "epoch": 2.2926648096564533, + "grad_norm": 1.4482837915420532, + "learning_rate": 1e-06, + "loss": 0.2739, + "mean_token_accuracy": 0.9001811146736145, + "num_tokens": 449822314.0, + "step": 12346 + }, + { + "epoch": 2.2928505106778085, + "grad_norm": 1.6563984155654907, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8872515559196472, + "num_tokens": 449856596.0, + "step": 12347 + }, + { + "epoch": 2.2930362116991643, + "grad_norm": 1.6714552640914917, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8750152587890625, + "num_tokens": 449894945.0, + "step": 12348 + }, + { + "epoch": 2.29322191272052, + "grad_norm": 1.705797553062439, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8947592973709106, + "num_tokens": 449927629.0, + "step": 12349 + }, + { + "epoch": 2.2934076137418757, + "grad_norm": 1.6781872510910034, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8892726898193359, + "num_tokens": 449964756.0, + "step": 12350 + }, + { + "epoch": 2.293593314763231, + "grad_norm": 1.6487399339675903, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8796859979629517, + "num_tokens": 450002206.0, + "step": 12351 + }, + { + "epoch": 2.2937790157845868, + "grad_norm": 1.6653656959533691, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.888113796710968, + "num_tokens": 450037210.0, + "step": 12352 + }, + { + "epoch": 2.2939647168059425, + "grad_norm": 1.5154565572738647, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8928040266036987, + "num_tokens": 450077861.0, + "step": 12353 + }, + { + "epoch": 2.2941504178272982, + "grad_norm": 1.8300102949142456, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8968104720115662, + "num_tokens": 450106718.0, + "step": 12354 + }, + { + "epoch": 2.2943361188486535, + "grad_norm": 1.5431712865829468, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8936137557029724, + "num_tokens": 450146533.0, + "step": 12355 + }, + { + "epoch": 2.2945218198700092, + "grad_norm": 1.7146438360214233, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8988706469535828, + "num_tokens": 450177344.0, + "step": 12356 + }, + { + "epoch": 2.294707520891365, + "grad_norm": 1.6401702165603638, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8734127283096313, + "num_tokens": 450212772.0, + "step": 12357 + }, + { + "epoch": 2.2948932219127203, + "grad_norm": 1.62815260887146, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.893570601940155, + "num_tokens": 450246707.0, + "step": 12358 + }, + { + "epoch": 2.295078922934076, + "grad_norm": 1.598121166229248, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8911009430885315, + "num_tokens": 450280999.0, + "step": 12359 + }, + { + "epoch": 2.2952646239554317, + "grad_norm": 1.6160080432891846, + "learning_rate": 1e-06, + "loss": 0.2784, + "mean_token_accuracy": 0.8996742963790894, + "num_tokens": 450316575.0, + "step": 12360 + }, + { + "epoch": 2.2954503249767875, + "grad_norm": 1.7298439741134644, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8833683729171753, + "num_tokens": 450346942.0, + "step": 12361 + }, + { + "epoch": 2.295636025998143, + "grad_norm": 1.4335557222366333, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8854947090148926, + "num_tokens": 450391815.0, + "step": 12362 + }, + { + "epoch": 2.2958217270194985, + "grad_norm": 1.5602365732192993, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.868675947189331, + "num_tokens": 450432996.0, + "step": 12363 + }, + { + "epoch": 2.2960074280408542, + "grad_norm": 1.5784341096878052, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8924759030342102, + "num_tokens": 450469991.0, + "step": 12364 + }, + { + "epoch": 2.29619312906221, + "grad_norm": 1.5578975677490234, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8942188024520874, + "num_tokens": 450507897.0, + "step": 12365 + }, + { + "epoch": 2.2963788300835652, + "grad_norm": 1.7108935117721558, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8784271478652954, + "num_tokens": 450542576.0, + "step": 12366 + }, + { + "epoch": 2.296564531104921, + "grad_norm": 1.556243896484375, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8825430870056152, + "num_tokens": 450580233.0, + "step": 12367 + }, + { + "epoch": 2.2967502321262767, + "grad_norm": 1.4702750444412231, + "learning_rate": 1e-06, + "loss": 0.2715, + "mean_token_accuracy": 0.9016213417053223, + "num_tokens": 450619548.0, + "step": 12368 + }, + { + "epoch": 2.2969359331476324, + "grad_norm": 1.5959386825561523, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8888081312179565, + "num_tokens": 450653812.0, + "step": 12369 + }, + { + "epoch": 2.2971216341689877, + "grad_norm": 1.577441930770874, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8869826793670654, + "num_tokens": 450690301.0, + "step": 12370 + }, + { + "epoch": 2.2973073351903435, + "grad_norm": 1.5304776430130005, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8945176601409912, + "num_tokens": 450731277.0, + "step": 12371 + }, + { + "epoch": 2.297493036211699, + "grad_norm": 1.8175395727157593, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8927373886108398, + "num_tokens": 450763703.0, + "step": 12372 + }, + { + "epoch": 2.297678737233055, + "grad_norm": 1.630383014678955, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8914607763290405, + "num_tokens": 450800325.0, + "step": 12373 + }, + { + "epoch": 2.2978644382544102, + "grad_norm": 1.5092523097991943, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8866260647773743, + "num_tokens": 450840713.0, + "step": 12374 + }, + { + "epoch": 2.298050139275766, + "grad_norm": 1.6385186910629272, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8857402801513672, + "num_tokens": 450877807.0, + "step": 12375 + }, + { + "epoch": 2.2982358402971217, + "grad_norm": 1.5291192531585693, + "learning_rate": 1e-06, + "loss": 0.2731, + "mean_token_accuracy": 0.8983432054519653, + "num_tokens": 450912471.0, + "step": 12376 + }, + { + "epoch": 2.2984215413184774, + "grad_norm": 1.5789663791656494, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8796363472938538, + "num_tokens": 450951976.0, + "step": 12377 + }, + { + "epoch": 2.2986072423398327, + "grad_norm": 1.52565336227417, + "learning_rate": 1e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.898716926574707, + "num_tokens": 450988657.0, + "step": 12378 + }, + { + "epoch": 2.2987929433611884, + "grad_norm": 1.646399974822998, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8872894644737244, + "num_tokens": 451022767.0, + "step": 12379 + }, + { + "epoch": 2.298978644382544, + "grad_norm": 1.5878653526306152, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8737927675247192, + "num_tokens": 451064831.0, + "step": 12380 + }, + { + "epoch": 2.2991643454039, + "grad_norm": 1.4761326313018799, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.89127516746521, + "num_tokens": 451105354.0, + "step": 12381 + }, + { + "epoch": 2.299350046425255, + "grad_norm": 1.6124522686004639, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8828117251396179, + "num_tokens": 451142109.0, + "step": 12382 + }, + { + "epoch": 2.299535747446611, + "grad_norm": 1.5940629243850708, + "learning_rate": 1e-06, + "loss": 0.2725, + "mean_token_accuracy": 0.901324450969696, + "num_tokens": 451175405.0, + "step": 12383 + }, + { + "epoch": 2.2997214484679667, + "grad_norm": 1.6505053043365479, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8780686855316162, + "num_tokens": 451210554.0, + "step": 12384 + }, + { + "epoch": 2.2999071494893224, + "grad_norm": 1.6533315181732178, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8760460615158081, + "num_tokens": 451246407.0, + "step": 12385 + }, + { + "epoch": 2.3000928505106777, + "grad_norm": 1.5726449489593506, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8771913051605225, + "num_tokens": 451287435.0, + "step": 12386 + }, + { + "epoch": 2.3002785515320334, + "grad_norm": 1.4386857748031616, + "learning_rate": 1e-06, + "loss": 0.2702, + "mean_token_accuracy": 0.8990488052368164, + "num_tokens": 451323847.0, + "step": 12387 + }, + { + "epoch": 2.300464252553389, + "grad_norm": 1.8489465713500977, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8631414771080017, + "num_tokens": 451360988.0, + "step": 12388 + }, + { + "epoch": 2.3006499535747444, + "grad_norm": 1.7084519863128662, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8785110712051392, + "num_tokens": 451396580.0, + "step": 12389 + }, + { + "epoch": 2.3008356545961, + "grad_norm": 1.5923428535461426, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8879934549331665, + "num_tokens": 451430564.0, + "step": 12390 + }, + { + "epoch": 2.301021355617456, + "grad_norm": 1.700847864151001, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8837929964065552, + "num_tokens": 451464133.0, + "step": 12391 + }, + { + "epoch": 2.3012070566388116, + "grad_norm": 1.4585713148117065, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8942400217056274, + "num_tokens": 451504721.0, + "step": 12392 + }, + { + "epoch": 2.3013927576601674, + "grad_norm": 1.6551634073257446, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8886534571647644, + "num_tokens": 451537937.0, + "step": 12393 + }, + { + "epoch": 2.3015784586815227, + "grad_norm": 1.5636749267578125, + "learning_rate": 1e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.8995165824890137, + "num_tokens": 451574177.0, + "step": 12394 + }, + { + "epoch": 2.3017641597028784, + "grad_norm": 1.6759061813354492, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8832825422286987, + "num_tokens": 451606969.0, + "step": 12395 + }, + { + "epoch": 2.301949860724234, + "grad_norm": 1.6584535837173462, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8838861584663391, + "num_tokens": 451639571.0, + "step": 12396 + }, + { + "epoch": 2.3021355617455894, + "grad_norm": 1.6799765825271606, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8840980529785156, + "num_tokens": 451677503.0, + "step": 12397 + }, + { + "epoch": 2.302321262766945, + "grad_norm": 1.685853123664856, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.893645167350769, + "num_tokens": 451707579.0, + "step": 12398 + }, + { + "epoch": 2.302506963788301, + "grad_norm": 1.7038509845733643, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.8950386047363281, + "num_tokens": 451737407.0, + "step": 12399 + }, + { + "epoch": 2.3026926648096566, + "grad_norm": 1.6862008571624756, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8932613730430603, + "num_tokens": 451767853.0, + "step": 12400 + }, + { + "epoch": 2.302878365831012, + "grad_norm": 1.5899040699005127, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.897477924823761, + "num_tokens": 451802384.0, + "step": 12401 + }, + { + "epoch": 2.3030640668523676, + "grad_norm": 1.663202166557312, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8872073292732239, + "num_tokens": 451835670.0, + "step": 12402 + }, + { + "epoch": 2.3032497678737234, + "grad_norm": 1.5490903854370117, + "learning_rate": 1e-06, + "loss": 0.2644, + "mean_token_accuracy": 0.9043213129043579, + "num_tokens": 451869514.0, + "step": 12403 + }, + { + "epoch": 2.303435468895079, + "grad_norm": 1.646375060081482, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8925102353096008, + "num_tokens": 451905205.0, + "step": 12404 + }, + { + "epoch": 2.3036211699164344, + "grad_norm": 1.605610728263855, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8780302405357361, + "num_tokens": 451942650.0, + "step": 12405 + }, + { + "epoch": 2.30380687093779, + "grad_norm": 1.6111170053482056, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8901678323745728, + "num_tokens": 451976939.0, + "step": 12406 + }, + { + "epoch": 2.303992571959146, + "grad_norm": 1.5683609247207642, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8866607546806335, + "num_tokens": 452015585.0, + "step": 12407 + }, + { + "epoch": 2.3041782729805016, + "grad_norm": 1.681530237197876, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8914975523948669, + "num_tokens": 452051397.0, + "step": 12408 + }, + { + "epoch": 2.304363974001857, + "grad_norm": 1.6336508989334106, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.880877673625946, + "num_tokens": 452089100.0, + "step": 12409 + }, + { + "epoch": 2.3045496750232126, + "grad_norm": 1.8127272129058838, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8871562480926514, + "num_tokens": 452120178.0, + "step": 12410 + }, + { + "epoch": 2.3047353760445684, + "grad_norm": 1.5569499731063843, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.89317786693573, + "num_tokens": 452158333.0, + "step": 12411 + }, + { + "epoch": 2.3049210770659236, + "grad_norm": 1.6332902908325195, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8650420904159546, + "num_tokens": 452198130.0, + "step": 12412 + }, + { + "epoch": 2.3051067780872794, + "grad_norm": 1.6396816968917847, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.886096715927124, + "num_tokens": 452238508.0, + "step": 12413 + }, + { + "epoch": 2.305292479108635, + "grad_norm": 1.5214406251907349, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8952912092208862, + "num_tokens": 452273529.0, + "step": 12414 + }, + { + "epoch": 2.305478180129991, + "grad_norm": 1.6925848722457886, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8897030353546143, + "num_tokens": 452309426.0, + "step": 12415 + }, + { + "epoch": 2.3056638811513466, + "grad_norm": 1.7167274951934814, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8885184526443481, + "num_tokens": 452342090.0, + "step": 12416 + }, + { + "epoch": 2.305849582172702, + "grad_norm": 1.505276083946228, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.894740104675293, + "num_tokens": 452379988.0, + "step": 12417 + }, + { + "epoch": 2.3060352831940576, + "grad_norm": 1.8514548540115356, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8921791315078735, + "num_tokens": 452409969.0, + "step": 12418 + }, + { + "epoch": 2.3062209842154133, + "grad_norm": 1.7173689603805542, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8889418840408325, + "num_tokens": 452450016.0, + "step": 12419 + }, + { + "epoch": 2.3064066852367686, + "grad_norm": 1.6874257326126099, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8824189901351929, + "num_tokens": 452482428.0, + "step": 12420 + }, + { + "epoch": 2.3065923862581243, + "grad_norm": 1.6606777906417847, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8871440887451172, + "num_tokens": 452517096.0, + "step": 12421 + }, + { + "epoch": 2.30677808727948, + "grad_norm": 1.6490099430084229, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8910089731216431, + "num_tokens": 452554241.0, + "step": 12422 + }, + { + "epoch": 2.306963788300836, + "grad_norm": 1.6365885734558105, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8930926322937012, + "num_tokens": 452592093.0, + "step": 12423 + }, + { + "epoch": 2.307149489322191, + "grad_norm": 1.5124619007110596, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.88228440284729, + "num_tokens": 452633331.0, + "step": 12424 + }, + { + "epoch": 2.307335190343547, + "grad_norm": 1.5835503339767456, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8937567472457886, + "num_tokens": 452667475.0, + "step": 12425 + }, + { + "epoch": 2.3075208913649026, + "grad_norm": 2.1347854137420654, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8617433905601501, + "num_tokens": 452697096.0, + "step": 12426 + }, + { + "epoch": 2.3077065923862583, + "grad_norm": 1.5909366607666016, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.893743634223938, + "num_tokens": 452731358.0, + "step": 12427 + }, + { + "epoch": 2.3078922934076136, + "grad_norm": 1.6597449779510498, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8862577676773071, + "num_tokens": 452766250.0, + "step": 12428 + }, + { + "epoch": 2.3080779944289693, + "grad_norm": 1.8624159097671509, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8775294423103333, + "num_tokens": 452796359.0, + "step": 12429 + }, + { + "epoch": 2.308263695450325, + "grad_norm": 1.681124210357666, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8713737726211548, + "num_tokens": 452834092.0, + "step": 12430 + }, + { + "epoch": 2.308449396471681, + "grad_norm": 1.6072195768356323, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8789424300193787, + "num_tokens": 452871455.0, + "step": 12431 + }, + { + "epoch": 2.308635097493036, + "grad_norm": 1.7804641723632812, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8795168399810791, + "num_tokens": 452901150.0, + "step": 12432 + }, + { + "epoch": 2.308820798514392, + "grad_norm": 1.5787038803100586, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8798280954360962, + "num_tokens": 452937873.0, + "step": 12433 + }, + { + "epoch": 2.3090064995357475, + "grad_norm": 1.648781180381775, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.8992794752120972, + "num_tokens": 452969162.0, + "step": 12434 + }, + { + "epoch": 2.309192200557103, + "grad_norm": 1.671994686126709, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8844285011291504, + "num_tokens": 453007447.0, + "step": 12435 + }, + { + "epoch": 2.3093779015784586, + "grad_norm": 1.4221316576004028, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8943005204200745, + "num_tokens": 453052570.0, + "step": 12436 + }, + { + "epoch": 2.3095636025998143, + "grad_norm": 1.6688940525054932, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8760874271392822, + "num_tokens": 453089355.0, + "step": 12437 + }, + { + "epoch": 2.30974930362117, + "grad_norm": 1.5467208623886108, + "learning_rate": 1e-06, + "loss": 0.279, + "mean_token_accuracy": 0.8989613056182861, + "num_tokens": 453131082.0, + "step": 12438 + }, + { + "epoch": 2.3099350046425258, + "grad_norm": 1.6312716007232666, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8870781660079956, + "num_tokens": 453166618.0, + "step": 12439 + }, + { + "epoch": 2.310120705663881, + "grad_norm": 1.8098610639572144, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.88451087474823, + "num_tokens": 453201706.0, + "step": 12440 + }, + { + "epoch": 2.310306406685237, + "grad_norm": 1.5754457712173462, + "learning_rate": 1e-06, + "loss": 0.2619, + "mean_token_accuracy": 0.904434084892273, + "num_tokens": 453237661.0, + "step": 12441 + }, + { + "epoch": 2.3104921077065925, + "grad_norm": 1.632676362991333, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8724972605705261, + "num_tokens": 453273433.0, + "step": 12442 + }, + { + "epoch": 2.310677808727948, + "grad_norm": 1.5834085941314697, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8887872695922852, + "num_tokens": 453310145.0, + "step": 12443 + }, + { + "epoch": 2.3108635097493035, + "grad_norm": 1.6031773090362549, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8847950100898743, + "num_tokens": 453343870.0, + "step": 12444 + }, + { + "epoch": 2.3110492107706593, + "grad_norm": 1.7197104692459106, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8846895098686218, + "num_tokens": 453380002.0, + "step": 12445 + }, + { + "epoch": 2.311234911792015, + "grad_norm": 1.6227182149887085, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8893533945083618, + "num_tokens": 453416276.0, + "step": 12446 + }, + { + "epoch": 2.3114206128133703, + "grad_norm": 1.7711522579193115, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8904122114181519, + "num_tokens": 453446014.0, + "step": 12447 + }, + { + "epoch": 2.311606313834726, + "grad_norm": 1.7578380107879639, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8780373334884644, + "num_tokens": 453478573.0, + "step": 12448 + }, + { + "epoch": 2.3117920148560818, + "grad_norm": 1.723196029663086, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8844414353370667, + "num_tokens": 453512231.0, + "step": 12449 + }, + { + "epoch": 2.3119777158774375, + "grad_norm": 1.5699907541275024, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8687949180603027, + "num_tokens": 453553015.0, + "step": 12450 + }, + { + "epoch": 2.312163416898793, + "grad_norm": 1.6223856210708618, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8907819390296936, + "num_tokens": 453591348.0, + "step": 12451 + }, + { + "epoch": 2.3123491179201485, + "grad_norm": 1.7006630897521973, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8858750462532043, + "num_tokens": 453627447.0, + "step": 12452 + }, + { + "epoch": 2.3125348189415043, + "grad_norm": 1.7267091274261475, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8784307241439819, + "num_tokens": 453663044.0, + "step": 12453 + }, + { + "epoch": 2.31272051996286, + "grad_norm": 1.4816466569900513, + "learning_rate": 1e-06, + "loss": 0.2798, + "mean_token_accuracy": 0.8998281955718994, + "num_tokens": 453700862.0, + "step": 12454 + }, + { + "epoch": 2.3129062209842153, + "grad_norm": 1.8251137733459473, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8850032091140747, + "num_tokens": 453731867.0, + "step": 12455 + }, + { + "epoch": 2.313091922005571, + "grad_norm": 1.5315146446228027, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8949254751205444, + "num_tokens": 453770063.0, + "step": 12456 + }, + { + "epoch": 2.3132776230269267, + "grad_norm": 1.474632740020752, + "learning_rate": 1e-06, + "loss": 0.2698, + "mean_token_accuracy": 0.9007742404937744, + "num_tokens": 453808374.0, + "step": 12457 + }, + { + "epoch": 2.313463324048282, + "grad_norm": 1.6468271017074585, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8917092084884644, + "num_tokens": 453840797.0, + "step": 12458 + }, + { + "epoch": 2.3136490250696378, + "grad_norm": 1.6235660314559937, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8713680505752563, + "num_tokens": 453881623.0, + "step": 12459 + }, + { + "epoch": 2.3138347260909935, + "grad_norm": 1.7069178819656372, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8656604886054993, + "num_tokens": 453918220.0, + "step": 12460 + }, + { + "epoch": 2.3140204271123492, + "grad_norm": 1.534256100654602, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.89549320936203, + "num_tokens": 453954616.0, + "step": 12461 + }, + { + "epoch": 2.314206128133705, + "grad_norm": 1.6760993003845215, + "learning_rate": 1e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.8989752531051636, + "num_tokens": 453988166.0, + "step": 12462 + }, + { + "epoch": 2.3143918291550603, + "grad_norm": 1.610098958015442, + "learning_rate": 1e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.8970440626144409, + "num_tokens": 454021474.0, + "step": 12463 + }, + { + "epoch": 2.314577530176416, + "grad_norm": 1.6481797695159912, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8731778860092163, + "num_tokens": 454060183.0, + "step": 12464 + }, + { + "epoch": 2.3147632311977717, + "grad_norm": 1.551209807395935, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8812124133110046, + "num_tokens": 454100413.0, + "step": 12465 + }, + { + "epoch": 2.314948932219127, + "grad_norm": 1.5855717658996582, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8771500587463379, + "num_tokens": 454140339.0, + "step": 12466 + }, + { + "epoch": 2.3151346332404827, + "grad_norm": 1.7183749675750732, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8871711492538452, + "num_tokens": 454175007.0, + "step": 12467 + }, + { + "epoch": 2.3153203342618385, + "grad_norm": 1.5048311948776245, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8827999830245972, + "num_tokens": 454214928.0, + "step": 12468 + }, + { + "epoch": 2.315506035283194, + "grad_norm": 1.4870457649230957, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8754755258560181, + "num_tokens": 454258931.0, + "step": 12469 + }, + { + "epoch": 2.3156917363045495, + "grad_norm": 1.7427746057510376, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8804827332496643, + "num_tokens": 454292142.0, + "step": 12470 + }, + { + "epoch": 2.3158774373259052, + "grad_norm": 1.5769611597061157, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8891544342041016, + "num_tokens": 454332387.0, + "step": 12471 + }, + { + "epoch": 2.316063138347261, + "grad_norm": 1.4707008600234985, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8926898241043091, + "num_tokens": 454373184.0, + "step": 12472 + }, + { + "epoch": 2.3162488393686167, + "grad_norm": 1.5164806842803955, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8907801508903503, + "num_tokens": 454414439.0, + "step": 12473 + }, + { + "epoch": 2.316434540389972, + "grad_norm": 1.540220022201538, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.893316388130188, + "num_tokens": 454455883.0, + "step": 12474 + }, + { + "epoch": 2.3166202414113277, + "grad_norm": 1.601781964302063, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8829235434532166, + "num_tokens": 454493710.0, + "step": 12475 + }, + { + "epoch": 2.3168059424326835, + "grad_norm": 1.735975980758667, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8892214298248291, + "num_tokens": 454526549.0, + "step": 12476 + }, + { + "epoch": 2.316991643454039, + "grad_norm": 1.5134940147399902, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8773326873779297, + "num_tokens": 454571036.0, + "step": 12477 + }, + { + "epoch": 2.3171773444753945, + "grad_norm": 1.705379843711853, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8880701065063477, + "num_tokens": 454604370.0, + "step": 12478 + }, + { + "epoch": 2.31736304549675, + "grad_norm": 1.6063108444213867, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8808706998825073, + "num_tokens": 454641434.0, + "step": 12479 + }, + { + "epoch": 2.317548746518106, + "grad_norm": 1.781786561012268, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8844305872917175, + "num_tokens": 454671817.0, + "step": 12480 + }, + { + "epoch": 2.3177344475394612, + "grad_norm": 1.7342274188995361, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8690778017044067, + "num_tokens": 454704618.0, + "step": 12481 + }, + { + "epoch": 2.317920148560817, + "grad_norm": 1.7714428901672363, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8784987330436707, + "num_tokens": 454740326.0, + "step": 12482 + }, + { + "epoch": 2.3181058495821727, + "grad_norm": 1.5678791999816895, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8963116407394409, + "num_tokens": 454779217.0, + "step": 12483 + }, + { + "epoch": 2.3182915506035284, + "grad_norm": 1.6967051029205322, + "learning_rate": 1e-06, + "loss": 0.2683, + "mean_token_accuracy": 0.901266872882843, + "num_tokens": 454809785.0, + "step": 12484 + }, + { + "epoch": 2.318477251624884, + "grad_norm": 1.5397391319274902, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8822170495986938, + "num_tokens": 454849816.0, + "step": 12485 + }, + { + "epoch": 2.3186629526462395, + "grad_norm": 1.525005578994751, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8736198544502258, + "num_tokens": 454890863.0, + "step": 12486 + }, + { + "epoch": 2.318848653667595, + "grad_norm": 1.5718821287155151, + "learning_rate": 1e-06, + "loss": 0.2724, + "mean_token_accuracy": 0.9002455472946167, + "num_tokens": 454925072.0, + "step": 12487 + }, + { + "epoch": 2.319034354688951, + "grad_norm": 1.5217628479003906, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8843337297439575, + "num_tokens": 454965308.0, + "step": 12488 + }, + { + "epoch": 2.319220055710306, + "grad_norm": 1.6665229797363281, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8899089694023132, + "num_tokens": 454997303.0, + "step": 12489 + }, + { + "epoch": 2.319405756731662, + "grad_norm": 1.6204869747161865, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8934994339942932, + "num_tokens": 455030003.0, + "step": 12490 + }, + { + "epoch": 2.3195914577530177, + "grad_norm": 1.660957932472229, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8761323690414429, + "num_tokens": 455068838.0, + "step": 12491 + }, + { + "epoch": 2.3197771587743734, + "grad_norm": 1.5850087404251099, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8776702880859375, + "num_tokens": 455108945.0, + "step": 12492 + }, + { + "epoch": 2.3199628597957287, + "grad_norm": 1.5666354894638062, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8844059705734253, + "num_tokens": 455147255.0, + "step": 12493 + }, + { + "epoch": 2.3201485608170844, + "grad_norm": 1.7292885780334473, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8791139125823975, + "num_tokens": 455177051.0, + "step": 12494 + }, + { + "epoch": 2.32033426183844, + "grad_norm": 1.4973641633987427, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.894776463508606, + "num_tokens": 455216293.0, + "step": 12495 + }, + { + "epoch": 2.320519962859796, + "grad_norm": 1.5065463781356812, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8897969722747803, + "num_tokens": 455253563.0, + "step": 12496 + }, + { + "epoch": 2.320705663881151, + "grad_norm": 1.4228911399841309, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8821977972984314, + "num_tokens": 455297943.0, + "step": 12497 + }, + { + "epoch": 2.320891364902507, + "grad_norm": 1.574623465538025, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8800060749053955, + "num_tokens": 455338167.0, + "step": 12498 + }, + { + "epoch": 2.3210770659238626, + "grad_norm": 1.5466457605361938, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8803991079330444, + "num_tokens": 455377147.0, + "step": 12499 + }, + { + "epoch": 2.3212627669452184, + "grad_norm": 1.5894821882247925, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8912658095359802, + "num_tokens": 455410554.0, + "step": 12500 + }, + { + "epoch": 2.3214484679665737, + "grad_norm": 1.662278175354004, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8823646903038025, + "num_tokens": 455445991.0, + "step": 12501 + }, + { + "epoch": 2.3216341689879294, + "grad_norm": 1.5955402851104736, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8934226632118225, + "num_tokens": 455479292.0, + "step": 12502 + }, + { + "epoch": 2.321819870009285, + "grad_norm": 1.6975595951080322, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8734896183013916, + "num_tokens": 455514601.0, + "step": 12503 + }, + { + "epoch": 2.3220055710306404, + "grad_norm": 1.6497553586959839, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8964173793792725, + "num_tokens": 455550504.0, + "step": 12504 + }, + { + "epoch": 2.322191272051996, + "grad_norm": 1.5343308448791504, + "learning_rate": 1e-06, + "loss": 0.2689, + "mean_token_accuracy": 0.9013316035270691, + "num_tokens": 455587858.0, + "step": 12505 + }, + { + "epoch": 2.322376973073352, + "grad_norm": 1.566211223602295, + "learning_rate": 1e-06, + "loss": 0.2709, + "mean_token_accuracy": 0.8973077535629272, + "num_tokens": 455620735.0, + "step": 12506 + }, + { + "epoch": 2.3225626740947076, + "grad_norm": 1.536643385887146, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8921605348587036, + "num_tokens": 455659180.0, + "step": 12507 + }, + { + "epoch": 2.3227483751160634, + "grad_norm": 1.7259242534637451, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.9020900726318359, + "num_tokens": 455689719.0, + "step": 12508 + }, + { + "epoch": 2.3229340761374186, + "grad_norm": 1.6678770780563354, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8827379941940308, + "num_tokens": 455724355.0, + "step": 12509 + }, + { + "epoch": 2.3231197771587744, + "grad_norm": 1.4390243291854858, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8936511278152466, + "num_tokens": 455765670.0, + "step": 12510 + }, + { + "epoch": 2.32330547818013, + "grad_norm": 1.7191272974014282, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8904702663421631, + "num_tokens": 455800340.0, + "step": 12511 + }, + { + "epoch": 2.3234911792014854, + "grad_norm": 1.6709994077682495, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8924492001533508, + "num_tokens": 455834066.0, + "step": 12512 + }, + { + "epoch": 2.323676880222841, + "grad_norm": 1.6469563245773315, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8910003900527954, + "num_tokens": 455866216.0, + "step": 12513 + }, + { + "epoch": 2.323862581244197, + "grad_norm": 1.813709020614624, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8722363710403442, + "num_tokens": 455900556.0, + "step": 12514 + }, + { + "epoch": 2.3240482822655526, + "grad_norm": 1.6456718444824219, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8959230184555054, + "num_tokens": 455935511.0, + "step": 12515 + }, + { + "epoch": 2.324233983286908, + "grad_norm": 1.7201346158981323, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8890072107315063, + "num_tokens": 455965861.0, + "step": 12516 + }, + { + "epoch": 2.3244196843082636, + "grad_norm": 1.496766209602356, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8884392976760864, + "num_tokens": 456008651.0, + "step": 12517 + }, + { + "epoch": 2.3246053853296194, + "grad_norm": 1.7326993942260742, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8840919733047485, + "num_tokens": 456046062.0, + "step": 12518 + }, + { + "epoch": 2.324791086350975, + "grad_norm": 1.5906389951705933, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8814069628715515, + "num_tokens": 456083728.0, + "step": 12519 + }, + { + "epoch": 2.3249767873723304, + "grad_norm": 1.6638215780258179, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8902556896209717, + "num_tokens": 456119457.0, + "step": 12520 + }, + { + "epoch": 2.325162488393686, + "grad_norm": 1.551221489906311, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8905312418937683, + "num_tokens": 456156377.0, + "step": 12521 + }, + { + "epoch": 2.325348189415042, + "grad_norm": 1.8504548072814941, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.883914589881897, + "num_tokens": 456189193.0, + "step": 12522 + }, + { + "epoch": 2.3255338904363976, + "grad_norm": 1.5291372537612915, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8840072154998779, + "num_tokens": 456231196.0, + "step": 12523 + }, + { + "epoch": 2.325719591457753, + "grad_norm": 1.5677303075790405, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8948981761932373, + "num_tokens": 456269510.0, + "step": 12524 + }, + { + "epoch": 2.3259052924791086, + "grad_norm": 1.6445564031600952, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8787201046943665, + "num_tokens": 456308415.0, + "step": 12525 + }, + { + "epoch": 2.3260909935004643, + "grad_norm": 1.6490832567214966, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8925033807754517, + "num_tokens": 456345092.0, + "step": 12526 + }, + { + "epoch": 2.3262766945218196, + "grad_norm": 1.6712899208068848, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.9007312059402466, + "num_tokens": 456375758.0, + "step": 12527 + }, + { + "epoch": 2.3264623955431754, + "grad_norm": 1.8776421546936035, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8927299976348877, + "num_tokens": 456411373.0, + "step": 12528 + }, + { + "epoch": 2.326648096564531, + "grad_norm": 1.5962179899215698, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8847639560699463, + "num_tokens": 456447205.0, + "step": 12529 + }, + { + "epoch": 2.326833797585887, + "grad_norm": 1.5208899974822998, + "learning_rate": 1e-06, + "loss": 0.2613, + "mean_token_accuracy": 0.903546154499054, + "num_tokens": 456481628.0, + "step": 12530 + }, + { + "epoch": 2.3270194986072426, + "grad_norm": 1.7633063793182373, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8720566034317017, + "num_tokens": 456517150.0, + "step": 12531 + }, + { + "epoch": 2.327205199628598, + "grad_norm": 1.6460864543914795, + "learning_rate": 1e-06, + "loss": 0.2758, + "mean_token_accuracy": 0.899208664894104, + "num_tokens": 456548150.0, + "step": 12532 + }, + { + "epoch": 2.3273909006499536, + "grad_norm": 1.6951106786727905, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8966922760009766, + "num_tokens": 456581210.0, + "step": 12533 + }, + { + "epoch": 2.3275766016713093, + "grad_norm": 1.6148569583892822, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8937027454376221, + "num_tokens": 456619064.0, + "step": 12534 + }, + { + "epoch": 2.3277623026926646, + "grad_norm": 1.728570818901062, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8928027153015137, + "num_tokens": 456650392.0, + "step": 12535 + }, + { + "epoch": 2.3279480037140203, + "grad_norm": 1.712303638458252, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.891862154006958, + "num_tokens": 456683058.0, + "step": 12536 + }, + { + "epoch": 2.328133704735376, + "grad_norm": 1.5407758951187134, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8906345367431641, + "num_tokens": 456721200.0, + "step": 12537 + }, + { + "epoch": 2.328319405756732, + "grad_norm": 1.5357145071029663, + "learning_rate": 1e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.9006217122077942, + "num_tokens": 456759036.0, + "step": 12538 + }, + { + "epoch": 2.328505106778087, + "grad_norm": 1.7422754764556885, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8671616911888123, + "num_tokens": 456793971.0, + "step": 12539 + }, + { + "epoch": 2.328690807799443, + "grad_norm": 1.7470394372940063, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8895724415779114, + "num_tokens": 456826643.0, + "step": 12540 + }, + { + "epoch": 2.3288765088207986, + "grad_norm": 1.5714179277420044, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8960355520248413, + "num_tokens": 456862477.0, + "step": 12541 + }, + { + "epoch": 2.3290622098421543, + "grad_norm": 1.5299110412597656, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.8897542953491211, + "num_tokens": 456899078.0, + "step": 12542 + }, + { + "epoch": 2.3292479108635096, + "grad_norm": 1.515131950378418, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8774204254150391, + "num_tokens": 456937530.0, + "step": 12543 + }, + { + "epoch": 2.3294336118848653, + "grad_norm": 1.4892445802688599, + "learning_rate": 1e-06, + "loss": 0.2796, + "mean_token_accuracy": 0.896125078201294, + "num_tokens": 456973598.0, + "step": 12544 + }, + { + "epoch": 2.329619312906221, + "grad_norm": 1.6702193021774292, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8838530778884888, + "num_tokens": 457010904.0, + "step": 12545 + }, + { + "epoch": 2.3298050139275768, + "grad_norm": 1.619769811630249, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8755785226821899, + "num_tokens": 457046902.0, + "step": 12546 + }, + { + "epoch": 2.329990714948932, + "grad_norm": 1.5445585250854492, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8871414065361023, + "num_tokens": 457085833.0, + "step": 12547 + }, + { + "epoch": 2.330176415970288, + "grad_norm": 1.6167830228805542, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8855164647102356, + "num_tokens": 457119972.0, + "step": 12548 + }, + { + "epoch": 2.3303621169916435, + "grad_norm": 1.7057850360870361, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8780885934829712, + "num_tokens": 457157210.0, + "step": 12549 + }, + { + "epoch": 2.3305478180129993, + "grad_norm": 1.6087723970413208, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8745386600494385, + "num_tokens": 457195360.0, + "step": 12550 + }, + { + "epoch": 2.3307335190343546, + "grad_norm": 1.5216809511184692, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8875269293785095, + "num_tokens": 457235463.0, + "step": 12551 + }, + { + "epoch": 2.3309192200557103, + "grad_norm": 1.5526670217514038, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8864983916282654, + "num_tokens": 457273759.0, + "step": 12552 + }, + { + "epoch": 2.331104921077066, + "grad_norm": 1.414680004119873, + "learning_rate": 1e-06, + "loss": 0.2699, + "mean_token_accuracy": 0.9024155735969543, + "num_tokens": 457312552.0, + "step": 12553 + }, + { + "epoch": 2.3312906220984217, + "grad_norm": 1.6439019441604614, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8770416378974915, + "num_tokens": 457348478.0, + "step": 12554 + }, + { + "epoch": 2.331476323119777, + "grad_norm": 1.5784622430801392, + "learning_rate": 1e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.8993205428123474, + "num_tokens": 457385325.0, + "step": 12555 + }, + { + "epoch": 2.3316620241411328, + "grad_norm": 1.6805294752120972, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8843396902084351, + "num_tokens": 457423752.0, + "step": 12556 + }, + { + "epoch": 2.3318477251624885, + "grad_norm": 1.6772964000701904, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8950400352478027, + "num_tokens": 457458394.0, + "step": 12557 + }, + { + "epoch": 2.332033426183844, + "grad_norm": 1.7372967004776, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8941276669502258, + "num_tokens": 457489132.0, + "step": 12558 + }, + { + "epoch": 2.3322191272051995, + "grad_norm": 1.496623158454895, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8842003345489502, + "num_tokens": 457528336.0, + "step": 12559 + }, + { + "epoch": 2.3324048282265553, + "grad_norm": 1.6125446557998657, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8755377531051636, + "num_tokens": 457569206.0, + "step": 12560 + }, + { + "epoch": 2.332590529247911, + "grad_norm": 1.5405726432800293, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8944648504257202, + "num_tokens": 457607421.0, + "step": 12561 + }, + { + "epoch": 2.3327762302692667, + "grad_norm": 1.5985575914382935, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8828874230384827, + "num_tokens": 457644888.0, + "step": 12562 + }, + { + "epoch": 2.332961931290622, + "grad_norm": 1.5511317253112793, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8755230903625488, + "num_tokens": 457684690.0, + "step": 12563 + }, + { + "epoch": 2.3331476323119777, + "grad_norm": 1.5081098079681396, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8934605717658997, + "num_tokens": 457726255.0, + "step": 12564 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 1.632230281829834, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.877404510974884, + "num_tokens": 457762178.0, + "step": 12565 + }, + { + "epoch": 2.3335190343546888, + "grad_norm": 1.6485682725906372, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8949092626571655, + "num_tokens": 457792749.0, + "step": 12566 + }, + { + "epoch": 2.3337047353760445, + "grad_norm": 1.7919145822525024, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8807698488235474, + "num_tokens": 457821997.0, + "step": 12567 + }, + { + "epoch": 2.3338904363974002, + "grad_norm": 1.8356881141662598, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8838119506835938, + "num_tokens": 457852778.0, + "step": 12568 + }, + { + "epoch": 2.334076137418756, + "grad_norm": 1.6689127683639526, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8902232646942139, + "num_tokens": 457884606.0, + "step": 12569 + }, + { + "epoch": 2.3342618384401113, + "grad_norm": 1.6217777729034424, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8766614198684692, + "num_tokens": 457922152.0, + "step": 12570 + }, + { + "epoch": 2.334447539461467, + "grad_norm": 1.660104513168335, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8948806524276733, + "num_tokens": 457955555.0, + "step": 12571 + }, + { + "epoch": 2.3346332404828227, + "grad_norm": 1.924585223197937, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8736801147460938, + "num_tokens": 457989594.0, + "step": 12572 + }, + { + "epoch": 2.3348189415041785, + "grad_norm": 1.877571940422058, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8793478012084961, + "num_tokens": 458019786.0, + "step": 12573 + }, + { + "epoch": 2.3350046425255337, + "grad_norm": 1.7083408832550049, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8802931308746338, + "num_tokens": 458053902.0, + "step": 12574 + }, + { + "epoch": 2.3351903435468895, + "grad_norm": 1.717397928237915, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8788117170333862, + "num_tokens": 458090017.0, + "step": 12575 + }, + { + "epoch": 2.335376044568245, + "grad_norm": 1.6727478504180908, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8913758993148804, + "num_tokens": 458125184.0, + "step": 12576 + }, + { + "epoch": 2.335561745589601, + "grad_norm": 1.6030550003051758, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8900326490402222, + "num_tokens": 458161301.0, + "step": 12577 + }, + { + "epoch": 2.3357474466109562, + "grad_norm": 1.7142128944396973, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8845182657241821, + "num_tokens": 458197984.0, + "step": 12578 + }, + { + "epoch": 2.335933147632312, + "grad_norm": 1.6746563911437988, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8750168681144714, + "num_tokens": 458230817.0, + "step": 12579 + }, + { + "epoch": 2.3361188486536677, + "grad_norm": 1.5251551866531372, + "learning_rate": 1e-06, + "loss": 0.281, + "mean_token_accuracy": 0.8988158106803894, + "num_tokens": 458265705.0, + "step": 12580 + }, + { + "epoch": 2.336304549675023, + "grad_norm": 1.768425464630127, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8733123540878296, + "num_tokens": 458302754.0, + "step": 12581 + }, + { + "epoch": 2.3364902506963787, + "grad_norm": 1.6551114320755005, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8906111717224121, + "num_tokens": 458337827.0, + "step": 12582 + }, + { + "epoch": 2.3366759517177345, + "grad_norm": 1.6504088640213013, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8625863194465637, + "num_tokens": 458377315.0, + "step": 12583 + }, + { + "epoch": 2.33686165273909, + "grad_norm": 1.4767413139343262, + "learning_rate": 1e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.8998469710350037, + "num_tokens": 458416649.0, + "step": 12584 + }, + { + "epoch": 2.337047353760446, + "grad_norm": 1.5356863737106323, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8807992339134216, + "num_tokens": 458461586.0, + "step": 12585 + }, + { + "epoch": 2.337233054781801, + "grad_norm": 1.7427130937576294, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8819035887718201, + "num_tokens": 458495190.0, + "step": 12586 + }, + { + "epoch": 2.337418755803157, + "grad_norm": 1.904510736465454, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8851845264434814, + "num_tokens": 458527314.0, + "step": 12587 + }, + { + "epoch": 2.3376044568245127, + "grad_norm": 1.7130600214004517, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.893141508102417, + "num_tokens": 458562260.0, + "step": 12588 + }, + { + "epoch": 2.337790157845868, + "grad_norm": 1.7380969524383545, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8797841668128967, + "num_tokens": 458597853.0, + "step": 12589 + }, + { + "epoch": 2.3379758588672237, + "grad_norm": 1.6190502643585205, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8885588645935059, + "num_tokens": 458633992.0, + "step": 12590 + }, + { + "epoch": 2.3381615598885794, + "grad_norm": 1.4188776016235352, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8976619839668274, + "num_tokens": 458677757.0, + "step": 12591 + }, + { + "epoch": 2.338347260909935, + "grad_norm": 1.7226439714431763, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8676012754440308, + "num_tokens": 458716779.0, + "step": 12592 + }, + { + "epoch": 2.3385329619312905, + "grad_norm": 1.4890820980072021, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.881388783454895, + "num_tokens": 458761161.0, + "step": 12593 + }, + { + "epoch": 2.338718662952646, + "grad_norm": 1.6708768606185913, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8885602355003357, + "num_tokens": 458793714.0, + "step": 12594 + }, + { + "epoch": 2.338904363974002, + "grad_norm": 1.6274234056472778, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8747268915176392, + "num_tokens": 458829846.0, + "step": 12595 + }, + { + "epoch": 2.3390900649953577, + "grad_norm": 1.6334493160247803, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8899624347686768, + "num_tokens": 458864424.0, + "step": 12596 + }, + { + "epoch": 2.339275766016713, + "grad_norm": 1.6016414165496826, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8925395011901855, + "num_tokens": 458897819.0, + "step": 12597 + }, + { + "epoch": 2.3394614670380687, + "grad_norm": 1.6719797849655151, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8877435922622681, + "num_tokens": 458930627.0, + "step": 12598 + }, + { + "epoch": 2.3396471680594244, + "grad_norm": 1.6644611358642578, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.892659068107605, + "num_tokens": 458961677.0, + "step": 12599 + }, + { + "epoch": 2.33983286908078, + "grad_norm": 1.8600572347640991, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8892698287963867, + "num_tokens": 458990014.0, + "step": 12600 + }, + { + "epoch": 2.3400185701021354, + "grad_norm": 1.5288488864898682, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8821534514427185, + "num_tokens": 459032719.0, + "step": 12601 + }, + { + "epoch": 2.340204271123491, + "grad_norm": 1.6085281372070312, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8923155665397644, + "num_tokens": 459067974.0, + "step": 12602 + }, + { + "epoch": 2.340389972144847, + "grad_norm": 1.43679678440094, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8874257802963257, + "num_tokens": 459112506.0, + "step": 12603 + }, + { + "epoch": 2.340575673166202, + "grad_norm": 1.5316994190216064, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8892894983291626, + "num_tokens": 459152336.0, + "step": 12604 + }, + { + "epoch": 2.340761374187558, + "grad_norm": 1.6148449182510376, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8907028436660767, + "num_tokens": 459185821.0, + "step": 12605 + }, + { + "epoch": 2.3409470752089137, + "grad_norm": 1.6686046123504639, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8913998007774353, + "num_tokens": 459218934.0, + "step": 12606 + }, + { + "epoch": 2.3411327762302694, + "grad_norm": 1.6395971775054932, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8761719465255737, + "num_tokens": 459255873.0, + "step": 12607 + }, + { + "epoch": 2.341318477251625, + "grad_norm": 1.523662805557251, + "learning_rate": 1e-06, + "loss": 0.2743, + "mean_token_accuracy": 0.902717113494873, + "num_tokens": 459297616.0, + "step": 12608 + }, + { + "epoch": 2.3415041782729804, + "grad_norm": 1.505839228630066, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8952692747116089, + "num_tokens": 459343819.0, + "step": 12609 + }, + { + "epoch": 2.341689879294336, + "grad_norm": 1.7439554929733276, + "learning_rate": 1e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.8947892189025879, + "num_tokens": 459376201.0, + "step": 12610 + }, + { + "epoch": 2.341875580315692, + "grad_norm": 1.6214890480041504, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8899150490760803, + "num_tokens": 459411966.0, + "step": 12611 + }, + { + "epoch": 2.342061281337047, + "grad_norm": 1.646969199180603, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.890687882900238, + "num_tokens": 459447865.0, + "step": 12612 + }, + { + "epoch": 2.342246982358403, + "grad_norm": 1.5455775260925293, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8876849412918091, + "num_tokens": 459488448.0, + "step": 12613 + }, + { + "epoch": 2.3424326833797586, + "grad_norm": 1.5931977033615112, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8817410469055176, + "num_tokens": 459526997.0, + "step": 12614 + }, + { + "epoch": 2.3426183844011144, + "grad_norm": 1.6840119361877441, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8827198147773743, + "num_tokens": 459565745.0, + "step": 12615 + }, + { + "epoch": 2.3428040854224697, + "grad_norm": 1.6186890602111816, + "learning_rate": 1e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.8952680230140686, + "num_tokens": 459604496.0, + "step": 12616 + }, + { + "epoch": 2.3429897864438254, + "grad_norm": 1.5695514678955078, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8915460109710693, + "num_tokens": 459641747.0, + "step": 12617 + }, + { + "epoch": 2.343175487465181, + "grad_norm": 1.5579472780227661, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8888378143310547, + "num_tokens": 459681241.0, + "step": 12618 + }, + { + "epoch": 2.343361188486537, + "grad_norm": 1.923370361328125, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8954296112060547, + "num_tokens": 459711911.0, + "step": 12619 + }, + { + "epoch": 2.343546889507892, + "grad_norm": 1.6866627931594849, + "learning_rate": 1e-06, + "loss": 0.2686, + "mean_token_accuracy": 0.9006305932998657, + "num_tokens": 459745784.0, + "step": 12620 + }, + { + "epoch": 2.343732590529248, + "grad_norm": 1.6668883562088013, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8807536959648132, + "num_tokens": 459782665.0, + "step": 12621 + }, + { + "epoch": 2.3439182915506036, + "grad_norm": 1.7315374612808228, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8972350358963013, + "num_tokens": 459813012.0, + "step": 12622 + }, + { + "epoch": 2.3441039925719593, + "grad_norm": 1.4366308450698853, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8971536159515381, + "num_tokens": 459853850.0, + "step": 12623 + }, + { + "epoch": 2.3442896935933146, + "grad_norm": 1.6274867057800293, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8914855122566223, + "num_tokens": 459890293.0, + "step": 12624 + }, + { + "epoch": 2.3444753946146704, + "grad_norm": 1.6472680568695068, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8740134239196777, + "num_tokens": 459925311.0, + "step": 12625 + }, + { + "epoch": 2.344661095636026, + "grad_norm": 1.5621739625930786, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8888545036315918, + "num_tokens": 459961872.0, + "step": 12626 + }, + { + "epoch": 2.3448467966573814, + "grad_norm": 1.6689738035202026, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8930971026420593, + "num_tokens": 459995421.0, + "step": 12627 + }, + { + "epoch": 2.345032497678737, + "grad_norm": 1.4762589931488037, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8896489143371582, + "num_tokens": 460039053.0, + "step": 12628 + }, + { + "epoch": 2.345218198700093, + "grad_norm": 1.4316625595092773, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.891968846321106, + "num_tokens": 460081773.0, + "step": 12629 + }, + { + "epoch": 2.3454038997214486, + "grad_norm": 1.6842576265335083, + "learning_rate": 1e-06, + "loss": 0.2706, + "mean_token_accuracy": 0.9034228920936584, + "num_tokens": 460111180.0, + "step": 12630 + }, + { + "epoch": 2.3455896007428043, + "grad_norm": 1.5403872728347778, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8843677043914795, + "num_tokens": 460150912.0, + "step": 12631 + }, + { + "epoch": 2.3457753017641596, + "grad_norm": 1.6247960329055786, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.892701268196106, + "num_tokens": 460184991.0, + "step": 12632 + }, + { + "epoch": 2.3459610027855153, + "grad_norm": 1.52908194065094, + "learning_rate": 1e-06, + "loss": 0.281, + "mean_token_accuracy": 0.8976138830184937, + "num_tokens": 460221233.0, + "step": 12633 + }, + { + "epoch": 2.346146703806871, + "grad_norm": 1.686937928199768, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8882941007614136, + "num_tokens": 460254515.0, + "step": 12634 + }, + { + "epoch": 2.3463324048282264, + "grad_norm": 1.4189622402191162, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8858875632286072, + "num_tokens": 460299290.0, + "step": 12635 + }, + { + "epoch": 2.346518105849582, + "grad_norm": 1.5763508081436157, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8764975070953369, + "num_tokens": 460339592.0, + "step": 12636 + }, + { + "epoch": 2.346703806870938, + "grad_norm": 1.5032823085784912, + "learning_rate": 1e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.8962876200675964, + "num_tokens": 460377538.0, + "step": 12637 + }, + { + "epoch": 2.3468895078922936, + "grad_norm": 1.6578495502471924, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8655359745025635, + "num_tokens": 460415503.0, + "step": 12638 + }, + { + "epoch": 2.347075208913649, + "grad_norm": 1.5765719413757324, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8869690895080566, + "num_tokens": 460449726.0, + "step": 12639 + }, + { + "epoch": 2.3472609099350046, + "grad_norm": 1.6310843229293823, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8851876258850098, + "num_tokens": 460483118.0, + "step": 12640 + }, + { + "epoch": 2.3474466109563603, + "grad_norm": 1.7046856880187988, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8762834668159485, + "num_tokens": 460518439.0, + "step": 12641 + }, + { + "epoch": 2.347632311977716, + "grad_norm": 1.5169342756271362, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8925651907920837, + "num_tokens": 460557436.0, + "step": 12642 + }, + { + "epoch": 2.3478180129990713, + "grad_norm": 1.478753924369812, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8799351453781128, + "num_tokens": 460601129.0, + "step": 12643 + }, + { + "epoch": 2.348003714020427, + "grad_norm": 1.44040048122406, + "learning_rate": 1e-06, + "loss": 0.275, + "mean_token_accuracy": 0.8989925384521484, + "num_tokens": 460639127.0, + "step": 12644 + }, + { + "epoch": 2.348189415041783, + "grad_norm": 1.6429409980773926, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8807942867279053, + "num_tokens": 460677056.0, + "step": 12645 + }, + { + "epoch": 2.3483751160631385, + "grad_norm": 1.6182360649108887, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8892114758491516, + "num_tokens": 460712725.0, + "step": 12646 + }, + { + "epoch": 2.348560817084494, + "grad_norm": 1.7472572326660156, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8764288425445557, + "num_tokens": 460749103.0, + "step": 12647 + }, + { + "epoch": 2.3487465181058496, + "grad_norm": 1.6013262271881104, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8930842876434326, + "num_tokens": 460784282.0, + "step": 12648 + }, + { + "epoch": 2.3489322191272053, + "grad_norm": 1.8909279108047485, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8762469291687012, + "num_tokens": 460811878.0, + "step": 12649 + }, + { + "epoch": 2.3491179201485606, + "grad_norm": 1.7770817279815674, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.889826238155365, + "num_tokens": 460844762.0, + "step": 12650 + }, + { + "epoch": 2.3493036211699163, + "grad_norm": 1.6158998012542725, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.881743311882019, + "num_tokens": 460880783.0, + "step": 12651 + }, + { + "epoch": 2.349489322191272, + "grad_norm": 1.543797492980957, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8866568803787231, + "num_tokens": 460921843.0, + "step": 12652 + }, + { + "epoch": 2.349675023212628, + "grad_norm": 1.6008424758911133, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8965691328048706, + "num_tokens": 460956587.0, + "step": 12653 + }, + { + "epoch": 2.3498607242339835, + "grad_norm": 1.7088862657546997, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8866837024688721, + "num_tokens": 460993928.0, + "step": 12654 + }, + { + "epoch": 2.350046425255339, + "grad_norm": 1.4800333976745605, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8938776254653931, + "num_tokens": 461036500.0, + "step": 12655 + }, + { + "epoch": 2.3502321262766945, + "grad_norm": 1.8202015161514282, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8889085650444031, + "num_tokens": 461066352.0, + "step": 12656 + }, + { + "epoch": 2.3504178272980503, + "grad_norm": 1.7276463508605957, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8851069211959839, + "num_tokens": 461098319.0, + "step": 12657 + }, + { + "epoch": 2.3506035283194056, + "grad_norm": 1.6288933753967285, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8862144351005554, + "num_tokens": 461135779.0, + "step": 12658 + }, + { + "epoch": 2.3507892293407613, + "grad_norm": 1.4237557649612427, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8864991068840027, + "num_tokens": 461178331.0, + "step": 12659 + }, + { + "epoch": 2.350974930362117, + "grad_norm": 1.5772842168807983, + "learning_rate": 1e-06, + "loss": 0.2831, + "mean_token_accuracy": 0.8968578577041626, + "num_tokens": 461213910.0, + "step": 12660 + }, + { + "epoch": 2.3511606313834728, + "grad_norm": 1.742889404296875, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8654433488845825, + "num_tokens": 461249759.0, + "step": 12661 + }, + { + "epoch": 2.351346332404828, + "grad_norm": 1.4988161325454712, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8909783363342285, + "num_tokens": 461291795.0, + "step": 12662 + }, + { + "epoch": 2.3515320334261838, + "grad_norm": 1.548670768737793, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8736657500267029, + "num_tokens": 461335034.0, + "step": 12663 + }, + { + "epoch": 2.3517177344475395, + "grad_norm": 1.7157914638519287, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8813697695732117, + "num_tokens": 461369420.0, + "step": 12664 + }, + { + "epoch": 2.3519034354688952, + "grad_norm": 1.6616019010543823, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8901858925819397, + "num_tokens": 461406703.0, + "step": 12665 + }, + { + "epoch": 2.3520891364902505, + "grad_norm": 1.441584587097168, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.895549476146698, + "num_tokens": 461449766.0, + "step": 12666 + }, + { + "epoch": 2.3522748375116063, + "grad_norm": 1.8045283555984497, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8892953395843506, + "num_tokens": 461482518.0, + "step": 12667 + }, + { + "epoch": 2.352460538532962, + "grad_norm": 1.510251760482788, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8948578834533691, + "num_tokens": 461522982.0, + "step": 12668 + }, + { + "epoch": 2.3526462395543177, + "grad_norm": 1.725080966949463, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8667319416999817, + "num_tokens": 461559610.0, + "step": 12669 + }, + { + "epoch": 2.352831940575673, + "grad_norm": 1.6290875673294067, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8938152194023132, + "num_tokens": 461595285.0, + "step": 12670 + }, + { + "epoch": 2.3530176415970288, + "grad_norm": 1.5837953090667725, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8926042318344116, + "num_tokens": 461630686.0, + "step": 12671 + }, + { + "epoch": 2.3532033426183845, + "grad_norm": 1.6939899921417236, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8755441904067993, + "num_tokens": 461667316.0, + "step": 12672 + }, + { + "epoch": 2.3533890436397398, + "grad_norm": 1.4627344608306885, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8861207962036133, + "num_tokens": 461707806.0, + "step": 12673 + }, + { + "epoch": 2.3535747446610955, + "grad_norm": 1.536761999130249, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8876531720161438, + "num_tokens": 461744999.0, + "step": 12674 + }, + { + "epoch": 2.3537604456824512, + "grad_norm": 1.5652616024017334, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8878700137138367, + "num_tokens": 461788601.0, + "step": 12675 + }, + { + "epoch": 2.353946146703807, + "grad_norm": 1.8414815664291382, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8648334741592407, + "num_tokens": 461820885.0, + "step": 12676 + }, + { + "epoch": 2.3541318477251627, + "grad_norm": 1.646807312965393, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8885307312011719, + "num_tokens": 461859708.0, + "step": 12677 + }, + { + "epoch": 2.354317548746518, + "grad_norm": 1.614885687828064, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.880073070526123, + "num_tokens": 461897948.0, + "step": 12678 + }, + { + "epoch": 2.3545032497678737, + "grad_norm": 1.7086372375488281, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8793724775314331, + "num_tokens": 461929913.0, + "step": 12679 + }, + { + "epoch": 2.3546889507892295, + "grad_norm": 1.6040114164352417, + "learning_rate": 1e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.8963461518287659, + "num_tokens": 461962619.0, + "step": 12680 + }, + { + "epoch": 2.3548746518105848, + "grad_norm": 1.734105110168457, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8783642053604126, + "num_tokens": 461997545.0, + "step": 12681 + }, + { + "epoch": 2.3550603528319405, + "grad_norm": 1.4836914539337158, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8937640190124512, + "num_tokens": 462038040.0, + "step": 12682 + }, + { + "epoch": 2.355246053853296, + "grad_norm": 1.7592461109161377, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8804698586463928, + "num_tokens": 462069359.0, + "step": 12683 + }, + { + "epoch": 2.355431754874652, + "grad_norm": 1.5567363500595093, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8778076171875, + "num_tokens": 462108811.0, + "step": 12684 + }, + { + "epoch": 2.3556174558960072, + "grad_norm": 1.611579179763794, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.892875611782074, + "num_tokens": 462144005.0, + "step": 12685 + }, + { + "epoch": 2.355803156917363, + "grad_norm": 1.6042816638946533, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8841566443443298, + "num_tokens": 462180298.0, + "step": 12686 + }, + { + "epoch": 2.3559888579387187, + "grad_norm": 1.610718846321106, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8955057263374329, + "num_tokens": 462218746.0, + "step": 12687 + }, + { + "epoch": 2.3561745589600744, + "grad_norm": 1.6656006574630737, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8746215105056763, + "num_tokens": 462257879.0, + "step": 12688 + }, + { + "epoch": 2.3563602599814297, + "grad_norm": 1.775754451751709, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8745747804641724, + "num_tokens": 462291715.0, + "step": 12689 + }, + { + "epoch": 2.3565459610027855, + "grad_norm": 1.5828373432159424, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8923397660255432, + "num_tokens": 462329655.0, + "step": 12690 + }, + { + "epoch": 2.356731662024141, + "grad_norm": 1.652614951133728, + "learning_rate": 1e-06, + "loss": 0.2712, + "mean_token_accuracy": 0.9023603200912476, + "num_tokens": 462362774.0, + "step": 12691 + }, + { + "epoch": 2.356917363045497, + "grad_norm": 1.6898555755615234, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8865987658500671, + "num_tokens": 462395436.0, + "step": 12692 + }, + { + "epoch": 2.357103064066852, + "grad_norm": 1.5404949188232422, + "learning_rate": 1e-06, + "loss": 0.2774, + "mean_token_accuracy": 0.8999190330505371, + "num_tokens": 462430798.0, + "step": 12693 + }, + { + "epoch": 2.357288765088208, + "grad_norm": 1.8456264734268188, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8745607137680054, + "num_tokens": 462461094.0, + "step": 12694 + }, + { + "epoch": 2.3574744661095637, + "grad_norm": 1.6595585346221924, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8749972581863403, + "num_tokens": 462498157.0, + "step": 12695 + }, + { + "epoch": 2.357660167130919, + "grad_norm": 1.592612624168396, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8885687589645386, + "num_tokens": 462535947.0, + "step": 12696 + }, + { + "epoch": 2.3578458681522747, + "grad_norm": 1.6610946655273438, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8840458393096924, + "num_tokens": 462570410.0, + "step": 12697 + }, + { + "epoch": 2.3580315691736304, + "grad_norm": 1.5329158306121826, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8779586553573608, + "num_tokens": 462611785.0, + "step": 12698 + }, + { + "epoch": 2.358217270194986, + "grad_norm": 1.5806455612182617, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8896532654762268, + "num_tokens": 462650934.0, + "step": 12699 + }, + { + "epoch": 2.358402971216342, + "grad_norm": 1.766917109489441, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8819377422332764, + "num_tokens": 462682569.0, + "step": 12700 + }, + { + "epoch": 2.358588672237697, + "grad_norm": 1.789912223815918, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.869136393070221, + "num_tokens": 462721696.0, + "step": 12701 + }, + { + "epoch": 2.358774373259053, + "grad_norm": 1.462906002998352, + "learning_rate": 1e-06, + "loss": 0.2763, + "mean_token_accuracy": 0.9007159471511841, + "num_tokens": 462762548.0, + "step": 12702 + }, + { + "epoch": 2.3589600742804087, + "grad_norm": 1.635494351387024, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8860131502151489, + "num_tokens": 462798072.0, + "step": 12703 + }, + { + "epoch": 2.359145775301764, + "grad_norm": 1.5361733436584473, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8989416360855103, + "num_tokens": 462835219.0, + "step": 12704 + }, + { + "epoch": 2.3593314763231197, + "grad_norm": 1.7093653678894043, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8874815702438354, + "num_tokens": 462869487.0, + "step": 12705 + }, + { + "epoch": 2.3595171773444754, + "grad_norm": 1.5747809410095215, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8953101634979248, + "num_tokens": 462906538.0, + "step": 12706 + }, + { + "epoch": 2.359702878365831, + "grad_norm": 1.6704635620117188, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8808795213699341, + "num_tokens": 462944627.0, + "step": 12707 + }, + { + "epoch": 2.3598885793871864, + "grad_norm": 1.4379903078079224, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8965946435928345, + "num_tokens": 462986414.0, + "step": 12708 + }, + { + "epoch": 2.360074280408542, + "grad_norm": 1.5677287578582764, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8918646574020386, + "num_tokens": 463024428.0, + "step": 12709 + }, + { + "epoch": 2.360259981429898, + "grad_norm": 1.640512228012085, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8886438608169556, + "num_tokens": 463059023.0, + "step": 12710 + }, + { + "epoch": 2.3604456824512536, + "grad_norm": 1.78408944606781, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8899255990982056, + "num_tokens": 463100739.0, + "step": 12711 + }, + { + "epoch": 2.360631383472609, + "grad_norm": 1.481611967086792, + "learning_rate": 1e-06, + "loss": 0.2524, + "mean_token_accuracy": 0.9049590826034546, + "num_tokens": 463135077.0, + "step": 12712 + }, + { + "epoch": 2.3608170844939647, + "grad_norm": 1.8157740831375122, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8835625648498535, + "num_tokens": 463165670.0, + "step": 12713 + }, + { + "epoch": 2.3610027855153204, + "grad_norm": 1.5630017518997192, + "learning_rate": 1e-06, + "loss": 0.2686, + "mean_token_accuracy": 0.9039487838745117, + "num_tokens": 463198389.0, + "step": 12714 + }, + { + "epoch": 2.361188486536676, + "grad_norm": 1.65413236618042, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8814188838005066, + "num_tokens": 463234951.0, + "step": 12715 + }, + { + "epoch": 2.3613741875580314, + "grad_norm": 1.5916458368301392, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.891274094581604, + "num_tokens": 463269199.0, + "step": 12716 + }, + { + "epoch": 2.361559888579387, + "grad_norm": 1.516916275024414, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8967301249504089, + "num_tokens": 463308888.0, + "step": 12717 + }, + { + "epoch": 2.361745589600743, + "grad_norm": 1.7267301082611084, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8837418556213379, + "num_tokens": 463342807.0, + "step": 12718 + }, + { + "epoch": 2.3619312906220986, + "grad_norm": 1.6072643995285034, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8903820514678955, + "num_tokens": 463382224.0, + "step": 12719 + }, + { + "epoch": 2.362116991643454, + "grad_norm": 1.6537028551101685, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8813401460647583, + "num_tokens": 463420663.0, + "step": 12720 + }, + { + "epoch": 2.3623026926648096, + "grad_norm": 1.6041003465652466, + "learning_rate": 1e-06, + "loss": 0.2668, + "mean_token_accuracy": 0.9008511900901794, + "num_tokens": 463456510.0, + "step": 12721 + }, + { + "epoch": 2.3624883936861654, + "grad_norm": 1.762326717376709, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.8933264017105103, + "num_tokens": 463485352.0, + "step": 12722 + }, + { + "epoch": 2.362674094707521, + "grad_norm": 1.647803544998169, + "learning_rate": 1e-06, + "loss": 0.2691, + "mean_token_accuracy": 0.898285448551178, + "num_tokens": 463517947.0, + "step": 12723 + }, + { + "epoch": 2.3628597957288764, + "grad_norm": 1.5845235586166382, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8849271535873413, + "num_tokens": 463558081.0, + "step": 12724 + }, + { + "epoch": 2.363045496750232, + "grad_norm": 1.6169426441192627, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8878984451293945, + "num_tokens": 463595737.0, + "step": 12725 + }, + { + "epoch": 2.363231197771588, + "grad_norm": 1.7718360424041748, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8815606236457825, + "num_tokens": 463629743.0, + "step": 12726 + }, + { + "epoch": 2.363416898792943, + "grad_norm": 1.7845646142959595, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.886573076248169, + "num_tokens": 463662758.0, + "step": 12727 + }, + { + "epoch": 2.363602599814299, + "grad_norm": 1.518546462059021, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8838348984718323, + "num_tokens": 463705800.0, + "step": 12728 + }, + { + "epoch": 2.3637883008356546, + "grad_norm": 1.5233100652694702, + "learning_rate": 1e-06, + "loss": 0.267, + "mean_token_accuracy": 0.9012578129768372, + "num_tokens": 463743646.0, + "step": 12729 + }, + { + "epoch": 2.3639740018570103, + "grad_norm": 1.6598913669586182, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.884300708770752, + "num_tokens": 463777332.0, + "step": 12730 + }, + { + "epoch": 2.364159702878366, + "grad_norm": 1.4894750118255615, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8926327228546143, + "num_tokens": 463817139.0, + "step": 12731 + }, + { + "epoch": 2.3643454038997214, + "grad_norm": 1.437073826789856, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8847385048866272, + "num_tokens": 463860081.0, + "step": 12732 + }, + { + "epoch": 2.364531104921077, + "grad_norm": 1.8021799325942993, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.87239670753479, + "num_tokens": 463891351.0, + "step": 12733 + }, + { + "epoch": 2.364716805942433, + "grad_norm": 1.6559113264083862, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8777296543121338, + "num_tokens": 463929650.0, + "step": 12734 + }, + { + "epoch": 2.364902506963788, + "grad_norm": 1.7316373586654663, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8763584494590759, + "num_tokens": 463964438.0, + "step": 12735 + }, + { + "epoch": 2.365088207985144, + "grad_norm": 1.587230920791626, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8796379566192627, + "num_tokens": 464004094.0, + "step": 12736 + }, + { + "epoch": 2.3652739090064996, + "grad_norm": 1.8575245141983032, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8779898881912231, + "num_tokens": 464037531.0, + "step": 12737 + }, + { + "epoch": 2.3654596100278553, + "grad_norm": 1.638588547706604, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8737115263938904, + "num_tokens": 464075242.0, + "step": 12738 + }, + { + "epoch": 2.3656453110492106, + "grad_norm": 1.5651142597198486, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8795315027236938, + "num_tokens": 464118159.0, + "step": 12739 + }, + { + "epoch": 2.3658310120705663, + "grad_norm": 1.60536527633667, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8914347887039185, + "num_tokens": 464151004.0, + "step": 12740 + }, + { + "epoch": 2.366016713091922, + "grad_norm": 1.6433920860290527, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8930549621582031, + "num_tokens": 464187216.0, + "step": 12741 + }, + { + "epoch": 2.366202414113278, + "grad_norm": 1.6511859893798828, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8767231702804565, + "num_tokens": 464223904.0, + "step": 12742 + }, + { + "epoch": 2.366388115134633, + "grad_norm": 1.458377480506897, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8875128626823425, + "num_tokens": 464266131.0, + "step": 12743 + }, + { + "epoch": 2.366573816155989, + "grad_norm": 1.7789771556854248, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.878962516784668, + "num_tokens": 464300488.0, + "step": 12744 + }, + { + "epoch": 2.3667595171773446, + "grad_norm": 1.6149848699569702, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8849382400512695, + "num_tokens": 464339213.0, + "step": 12745 + }, + { + "epoch": 2.3669452181987003, + "grad_norm": 1.6317673921585083, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8905231356620789, + "num_tokens": 464373603.0, + "step": 12746 + }, + { + "epoch": 2.3671309192200556, + "grad_norm": 1.6025512218475342, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8951303958892822, + "num_tokens": 464414338.0, + "step": 12747 + }, + { + "epoch": 2.3673166202414113, + "grad_norm": 1.4872689247131348, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8862158060073853, + "num_tokens": 464455305.0, + "step": 12748 + }, + { + "epoch": 2.367502321262767, + "grad_norm": 1.6922624111175537, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.886663019657135, + "num_tokens": 464488936.0, + "step": 12749 + }, + { + "epoch": 2.3676880222841223, + "grad_norm": 1.6429204940795898, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8791829347610474, + "num_tokens": 464524702.0, + "step": 12750 + }, + { + "epoch": 2.367873723305478, + "grad_norm": 1.6726844310760498, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.88044273853302, + "num_tokens": 464556924.0, + "step": 12751 + }, + { + "epoch": 2.368059424326834, + "grad_norm": 1.531329870223999, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8853703141212463, + "num_tokens": 464598315.0, + "step": 12752 + }, + { + "epoch": 2.3682451253481895, + "grad_norm": 1.6304264068603516, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8837112188339233, + "num_tokens": 464635852.0, + "step": 12753 + }, + { + "epoch": 2.3684308263695453, + "grad_norm": 1.6635152101516724, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8820674419403076, + "num_tokens": 464671092.0, + "step": 12754 + }, + { + "epoch": 2.3686165273909006, + "grad_norm": 1.6732505559921265, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8938164710998535, + "num_tokens": 464703765.0, + "step": 12755 + }, + { + "epoch": 2.3688022284122563, + "grad_norm": 1.3825609683990479, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.8981776237487793, + "num_tokens": 464747375.0, + "step": 12756 + }, + { + "epoch": 2.368987929433612, + "grad_norm": 1.6140140295028687, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8825754523277283, + "num_tokens": 464788063.0, + "step": 12757 + }, + { + "epoch": 2.3691736304549673, + "grad_norm": 1.722092628479004, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8771781325340271, + "num_tokens": 464821592.0, + "step": 12758 + }, + { + "epoch": 2.369359331476323, + "grad_norm": 1.650840163230896, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8802758455276489, + "num_tokens": 464858898.0, + "step": 12759 + }, + { + "epoch": 2.369545032497679, + "grad_norm": 1.508626103401184, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8953584432601929, + "num_tokens": 464897936.0, + "step": 12760 + }, + { + "epoch": 2.3697307335190345, + "grad_norm": 1.611952304840088, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8783401846885681, + "num_tokens": 464936516.0, + "step": 12761 + }, + { + "epoch": 2.36991643454039, + "grad_norm": 1.6259329319000244, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8875178098678589, + "num_tokens": 464974739.0, + "step": 12762 + }, + { + "epoch": 2.3701021355617455, + "grad_norm": 1.571110486984253, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.884462833404541, + "num_tokens": 465011944.0, + "step": 12763 + }, + { + "epoch": 2.3702878365831013, + "grad_norm": 1.656975507736206, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8869938254356384, + "num_tokens": 465048219.0, + "step": 12764 + }, + { + "epoch": 2.370473537604457, + "grad_norm": 1.6725746393203735, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8915424346923828, + "num_tokens": 465081853.0, + "step": 12765 + }, + { + "epoch": 2.3706592386258123, + "grad_norm": 1.5951943397521973, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8849345445632935, + "num_tokens": 465121674.0, + "step": 12766 + }, + { + "epoch": 2.370844939647168, + "grad_norm": 1.5810956954956055, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8687616586685181, + "num_tokens": 465162269.0, + "step": 12767 + }, + { + "epoch": 2.3710306406685238, + "grad_norm": 1.613236427307129, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8806106448173523, + "num_tokens": 465198146.0, + "step": 12768 + }, + { + "epoch": 2.3712163416898795, + "grad_norm": 1.507514238357544, + "learning_rate": 1e-06, + "loss": 0.2817, + "mean_token_accuracy": 0.8983131647109985, + "num_tokens": 465235772.0, + "step": 12769 + }, + { + "epoch": 2.371402042711235, + "grad_norm": 1.6431301832199097, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8792378306388855, + "num_tokens": 465272253.0, + "step": 12770 + }, + { + "epoch": 2.3715877437325905, + "grad_norm": 1.768537998199463, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8700259923934937, + "num_tokens": 465305463.0, + "step": 12771 + }, + { + "epoch": 2.3717734447539462, + "grad_norm": 1.6036843061447144, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.885425329208374, + "num_tokens": 465341452.0, + "step": 12772 + }, + { + "epoch": 2.3719591457753015, + "grad_norm": 1.6112669706344604, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8834874629974365, + "num_tokens": 465376673.0, + "step": 12773 + }, + { + "epoch": 2.3721448467966573, + "grad_norm": 1.5900146961212158, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8741655349731445, + "num_tokens": 465413608.0, + "step": 12774 + }, + { + "epoch": 2.372330547818013, + "grad_norm": 1.6241636276245117, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8863264322280884, + "num_tokens": 465449505.0, + "step": 12775 + }, + { + "epoch": 2.3725162488393687, + "grad_norm": 1.5065693855285645, + "learning_rate": 1e-06, + "loss": 0.2667, + "mean_token_accuracy": 0.902198314666748, + "num_tokens": 465486678.0, + "step": 12776 + }, + { + "epoch": 2.3727019498607245, + "grad_norm": 1.6099674701690674, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8755136728286743, + "num_tokens": 465527113.0, + "step": 12777 + }, + { + "epoch": 2.3728876508820798, + "grad_norm": 1.6484519243240356, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.8927880525588989, + "num_tokens": 465558968.0, + "step": 12778 + }, + { + "epoch": 2.3730733519034355, + "grad_norm": 1.710200309753418, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8880271911621094, + "num_tokens": 465590052.0, + "step": 12779 + }, + { + "epoch": 2.3732590529247912, + "grad_norm": 1.701086401939392, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8616886734962463, + "num_tokens": 465628112.0, + "step": 12780 + }, + { + "epoch": 2.3734447539461465, + "grad_norm": 1.5497442483901978, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.890526533126831, + "num_tokens": 465664824.0, + "step": 12781 + }, + { + "epoch": 2.3736304549675022, + "grad_norm": 1.706903338432312, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.88323974609375, + "num_tokens": 465702130.0, + "step": 12782 + }, + { + "epoch": 2.373816155988858, + "grad_norm": 1.4653384685516357, + "learning_rate": 1e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.8974301815032959, + "num_tokens": 465739428.0, + "step": 12783 + }, + { + "epoch": 2.3740018570102137, + "grad_norm": 1.5857839584350586, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8954492807388306, + "num_tokens": 465774070.0, + "step": 12784 + }, + { + "epoch": 2.374187558031569, + "grad_norm": 1.6389714479446411, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8929260969161987, + "num_tokens": 465808673.0, + "step": 12785 + }, + { + "epoch": 2.3743732590529247, + "grad_norm": 1.717180848121643, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8881867527961731, + "num_tokens": 465841333.0, + "step": 12786 + }, + { + "epoch": 2.3745589600742805, + "grad_norm": 1.6255697011947632, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8838033080101013, + "num_tokens": 465879500.0, + "step": 12787 + }, + { + "epoch": 2.374744661095636, + "grad_norm": 1.6023969650268555, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8702524900436401, + "num_tokens": 465919796.0, + "step": 12788 + }, + { + "epoch": 2.3749303621169915, + "grad_norm": 1.6232913732528687, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8866957426071167, + "num_tokens": 465959968.0, + "step": 12789 + }, + { + "epoch": 2.3751160631383472, + "grad_norm": 1.69331693649292, + "learning_rate": 1e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.897856593132019, + "num_tokens": 465996291.0, + "step": 12790 + }, + { + "epoch": 2.375301764159703, + "grad_norm": 1.4971461296081543, + "learning_rate": 1e-06, + "loss": 0.2809, + "mean_token_accuracy": 0.8976559638977051, + "num_tokens": 466033509.0, + "step": 12791 + }, + { + "epoch": 2.3754874651810587, + "grad_norm": 1.5689667463302612, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.887046217918396, + "num_tokens": 466068268.0, + "step": 12792 + }, + { + "epoch": 2.375673166202414, + "grad_norm": 1.6044018268585205, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8797600269317627, + "num_tokens": 466107106.0, + "step": 12793 + }, + { + "epoch": 2.3758588672237697, + "grad_norm": 1.7225927114486694, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8728698492050171, + "num_tokens": 466145355.0, + "step": 12794 + }, + { + "epoch": 2.3760445682451254, + "grad_norm": 1.8546031713485718, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8768288493156433, + "num_tokens": 466177147.0, + "step": 12795 + }, + { + "epoch": 2.3762302692664807, + "grad_norm": 1.6725000143051147, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8823703527450562, + "num_tokens": 466210514.0, + "step": 12796 + }, + { + "epoch": 2.3764159702878365, + "grad_norm": 1.7491073608398438, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8949424028396606, + "num_tokens": 466245038.0, + "step": 12797 + }, + { + "epoch": 2.376601671309192, + "grad_norm": 1.6700868606567383, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8959686756134033, + "num_tokens": 466275907.0, + "step": 12798 + }, + { + "epoch": 2.376787372330548, + "grad_norm": 1.750728964805603, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.896482527256012, + "num_tokens": 466309384.0, + "step": 12799 + }, + { + "epoch": 2.3769730733519037, + "grad_norm": 1.7948360443115234, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8741345405578613, + "num_tokens": 466342594.0, + "step": 12800 + }, + { + "epoch": 2.377158774373259, + "grad_norm": 1.5238091945648193, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8922068476676941, + "num_tokens": 466379704.0, + "step": 12801 + }, + { + "epoch": 2.3773444753946147, + "grad_norm": 1.5484822988510132, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8734592199325562, + "num_tokens": 466417246.0, + "step": 12802 + }, + { + "epoch": 2.3775301764159704, + "grad_norm": 1.7760076522827148, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8756720423698425, + "num_tokens": 466450003.0, + "step": 12803 + }, + { + "epoch": 2.3777158774373257, + "grad_norm": 1.6780551671981812, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8722940683364868, + "num_tokens": 466491566.0, + "step": 12804 + }, + { + "epoch": 2.3779015784586814, + "grad_norm": 1.6797114610671997, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8995885848999023, + "num_tokens": 466524111.0, + "step": 12805 + }, + { + "epoch": 2.378087279480037, + "grad_norm": 1.8835567235946655, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8708719611167908, + "num_tokens": 466559771.0, + "step": 12806 + }, + { + "epoch": 2.378272980501393, + "grad_norm": 1.646545171737671, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8817002773284912, + "num_tokens": 466595869.0, + "step": 12807 + }, + { + "epoch": 2.378458681522748, + "grad_norm": 1.956745982170105, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8935413360595703, + "num_tokens": 466623463.0, + "step": 12808 + }, + { + "epoch": 2.378644382544104, + "grad_norm": 1.736084222793579, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8800383806228638, + "num_tokens": 466655898.0, + "step": 12809 + }, + { + "epoch": 2.3788300835654597, + "grad_norm": 1.5493603944778442, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8846132755279541, + "num_tokens": 466696012.0, + "step": 12810 + }, + { + "epoch": 2.3790157845868154, + "grad_norm": 1.6170579195022583, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8863964080810547, + "num_tokens": 466731400.0, + "step": 12811 + }, + { + "epoch": 2.3792014856081707, + "grad_norm": 1.7463988065719604, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8950916528701782, + "num_tokens": 466763071.0, + "step": 12812 + }, + { + "epoch": 2.3793871866295264, + "grad_norm": 1.5974977016448975, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.87430340051651, + "num_tokens": 466802205.0, + "step": 12813 + }, + { + "epoch": 2.379572887650882, + "grad_norm": 1.5764474868774414, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8889938592910767, + "num_tokens": 466838762.0, + "step": 12814 + }, + { + "epoch": 2.379758588672238, + "grad_norm": 1.6786881685256958, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8910275101661682, + "num_tokens": 466876265.0, + "step": 12815 + }, + { + "epoch": 2.379944289693593, + "grad_norm": 1.721985936164856, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8764974474906921, + "num_tokens": 466912660.0, + "step": 12816 + }, + { + "epoch": 2.380129990714949, + "grad_norm": 1.4462776184082031, + "learning_rate": 1e-06, + "loss": 0.2777, + "mean_token_accuracy": 0.9011180996894836, + "num_tokens": 466956823.0, + "step": 12817 + }, + { + "epoch": 2.3803156917363046, + "grad_norm": 1.6761221885681152, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8754164576530457, + "num_tokens": 466990808.0, + "step": 12818 + }, + { + "epoch": 2.38050139275766, + "grad_norm": 1.755469799041748, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8752429485321045, + "num_tokens": 467022706.0, + "step": 12819 + }, + { + "epoch": 2.3806870937790157, + "grad_norm": 1.513108253479004, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8889884948730469, + "num_tokens": 467060476.0, + "step": 12820 + }, + { + "epoch": 2.3808727948003714, + "grad_norm": 1.5616549253463745, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8846539258956909, + "num_tokens": 467101821.0, + "step": 12821 + }, + { + "epoch": 2.381058495821727, + "grad_norm": 1.565720796585083, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8886371850967407, + "num_tokens": 467139345.0, + "step": 12822 + }, + { + "epoch": 2.381244196843083, + "grad_norm": 1.7403696775436401, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8832569122314453, + "num_tokens": 467176205.0, + "step": 12823 + }, + { + "epoch": 2.381429897864438, + "grad_norm": 1.7038052082061768, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8720521926879883, + "num_tokens": 467212578.0, + "step": 12824 + }, + { + "epoch": 2.381615598885794, + "grad_norm": 1.626494288444519, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8902971744537354, + "num_tokens": 467248087.0, + "step": 12825 + }, + { + "epoch": 2.3818012999071496, + "grad_norm": 1.609511375427246, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8977455496788025, + "num_tokens": 467280948.0, + "step": 12826 + }, + { + "epoch": 2.381987000928505, + "grad_norm": 1.6185581684112549, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.881352424621582, + "num_tokens": 467319376.0, + "step": 12827 + }, + { + "epoch": 2.3821727019498606, + "grad_norm": 1.6990677118301392, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8803805112838745, + "num_tokens": 467355841.0, + "step": 12828 + }, + { + "epoch": 2.3823584029712164, + "grad_norm": 1.7346981763839722, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8879764080047607, + "num_tokens": 467386998.0, + "step": 12829 + }, + { + "epoch": 2.382544103992572, + "grad_norm": 1.6046441793441772, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8852822780609131, + "num_tokens": 467419691.0, + "step": 12830 + }, + { + "epoch": 2.3827298050139274, + "grad_norm": 1.734602928161621, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8843483924865723, + "num_tokens": 467454163.0, + "step": 12831 + }, + { + "epoch": 2.382915506035283, + "grad_norm": 1.591821312904358, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8958398699760437, + "num_tokens": 467490879.0, + "step": 12832 + }, + { + "epoch": 2.383101207056639, + "grad_norm": 1.736716389656067, + "learning_rate": 1e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.8950086832046509, + "num_tokens": 467521643.0, + "step": 12833 + }, + { + "epoch": 2.3832869080779946, + "grad_norm": 1.5776909589767456, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8945580124855042, + "num_tokens": 467561483.0, + "step": 12834 + }, + { + "epoch": 2.38347260909935, + "grad_norm": 1.5448437929153442, + "learning_rate": 1e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8921471238136292, + "num_tokens": 467600302.0, + "step": 12835 + }, + { + "epoch": 2.3836583101207056, + "grad_norm": 1.4852439165115356, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8957663774490356, + "num_tokens": 467638252.0, + "step": 12836 + }, + { + "epoch": 2.3838440111420613, + "grad_norm": 1.590175986289978, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8882277011871338, + "num_tokens": 467674994.0, + "step": 12837 + }, + { + "epoch": 2.384029712163417, + "grad_norm": 1.4407764673233032, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8914946913719177, + "num_tokens": 467720062.0, + "step": 12838 + }, + { + "epoch": 2.3842154131847724, + "grad_norm": 1.4922406673431396, + "learning_rate": 1e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.8977600336074829, + "num_tokens": 467759478.0, + "step": 12839 + }, + { + "epoch": 2.384401114206128, + "grad_norm": 1.5382665395736694, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8856338262557983, + "num_tokens": 467797452.0, + "step": 12840 + }, + { + "epoch": 2.384586815227484, + "grad_norm": 1.4433408975601196, + "learning_rate": 1e-06, + "loss": 0.2776, + "mean_token_accuracy": 0.9012337923049927, + "num_tokens": 467837539.0, + "step": 12841 + }, + { + "epoch": 2.384772516248839, + "grad_norm": 1.5048916339874268, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8928795456886292, + "num_tokens": 467877228.0, + "step": 12842 + }, + { + "epoch": 2.384958217270195, + "grad_norm": 1.519813060760498, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8823408484458923, + "num_tokens": 467918428.0, + "step": 12843 + }, + { + "epoch": 2.3851439182915506, + "grad_norm": 1.630470633506775, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8912250995635986, + "num_tokens": 467951356.0, + "step": 12844 + }, + { + "epoch": 2.3853296193129063, + "grad_norm": 1.3662561178207397, + "learning_rate": 1e-06, + "loss": 0.2756, + "mean_token_accuracy": 0.8996809124946594, + "num_tokens": 467992516.0, + "step": 12845 + }, + { + "epoch": 2.385515320334262, + "grad_norm": 1.563194751739502, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8940488696098328, + "num_tokens": 468029795.0, + "step": 12846 + }, + { + "epoch": 2.3857010213556173, + "grad_norm": 1.638135552406311, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8810694813728333, + "num_tokens": 468065365.0, + "step": 12847 + }, + { + "epoch": 2.385886722376973, + "grad_norm": 1.678167700767517, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8817445635795593, + "num_tokens": 468100657.0, + "step": 12848 + }, + { + "epoch": 2.386072423398329, + "grad_norm": 1.738208532333374, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8881518244743347, + "num_tokens": 468131882.0, + "step": 12849 + }, + { + "epoch": 2.386258124419684, + "grad_norm": 1.546614646911621, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8860974907875061, + "num_tokens": 468170082.0, + "step": 12850 + }, + { + "epoch": 2.38644382544104, + "grad_norm": 1.5581284761428833, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8852337598800659, + "num_tokens": 468207798.0, + "step": 12851 + }, + { + "epoch": 2.3866295264623956, + "grad_norm": 1.495491623878479, + "learning_rate": 1e-06, + "loss": 0.2783, + "mean_token_accuracy": 0.8987598419189453, + "num_tokens": 468245231.0, + "step": 12852 + }, + { + "epoch": 2.3868152274837513, + "grad_norm": 1.6427503824234009, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8884414434432983, + "num_tokens": 468285006.0, + "step": 12853 + }, + { + "epoch": 2.3870009285051066, + "grad_norm": 1.5015982389450073, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8941909670829773, + "num_tokens": 468323084.0, + "step": 12854 + }, + { + "epoch": 2.3871866295264623, + "grad_norm": 1.6009865999221802, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8893119692802429, + "num_tokens": 468356398.0, + "step": 12855 + }, + { + "epoch": 2.387372330547818, + "grad_norm": 1.568644404411316, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8923559188842773, + "num_tokens": 468392267.0, + "step": 12856 + }, + { + "epoch": 2.387558031569174, + "grad_norm": 1.8824951648712158, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8824276328086853, + "num_tokens": 468419913.0, + "step": 12857 + }, + { + "epoch": 2.387743732590529, + "grad_norm": 1.6461923122406006, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8874688148498535, + "num_tokens": 468455126.0, + "step": 12858 + }, + { + "epoch": 2.387929433611885, + "grad_norm": 1.6210805177688599, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8788209557533264, + "num_tokens": 468491306.0, + "step": 12859 + }, + { + "epoch": 2.3881151346332405, + "grad_norm": 1.6185188293457031, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8848767280578613, + "num_tokens": 468530575.0, + "step": 12860 + }, + { + "epoch": 2.3883008356545963, + "grad_norm": 1.6496424674987793, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.8984907865524292, + "num_tokens": 468564572.0, + "step": 12861 + }, + { + "epoch": 2.3884865366759516, + "grad_norm": 1.7412606477737427, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8880810141563416, + "num_tokens": 468600065.0, + "step": 12862 + }, + { + "epoch": 2.3886722376973073, + "grad_norm": 1.6298129558563232, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8795892000198364, + "num_tokens": 468637238.0, + "step": 12863 + }, + { + "epoch": 2.388857938718663, + "grad_norm": 1.603280782699585, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8874167799949646, + "num_tokens": 468675603.0, + "step": 12864 + }, + { + "epoch": 2.3890436397400183, + "grad_norm": 1.556051254272461, + "learning_rate": 1e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.8978524208068848, + "num_tokens": 468713024.0, + "step": 12865 + }, + { + "epoch": 2.389229340761374, + "grad_norm": 1.6059850454330444, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8780189752578735, + "num_tokens": 468751186.0, + "step": 12866 + }, + { + "epoch": 2.38941504178273, + "grad_norm": 1.8469194173812866, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8767468929290771, + "num_tokens": 468785640.0, + "step": 12867 + }, + { + "epoch": 2.3896007428040855, + "grad_norm": 1.5997580289840698, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8852108120918274, + "num_tokens": 468821245.0, + "step": 12868 + }, + { + "epoch": 2.3897864438254413, + "grad_norm": 1.6282585859298706, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8911541700363159, + "num_tokens": 468854671.0, + "step": 12869 + }, + { + "epoch": 2.3899721448467965, + "grad_norm": 1.4876999855041504, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8862870931625366, + "num_tokens": 468896647.0, + "step": 12870 + }, + { + "epoch": 2.3901578458681523, + "grad_norm": 1.7044912576675415, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8797698020935059, + "num_tokens": 468931361.0, + "step": 12871 + }, + { + "epoch": 2.390343546889508, + "grad_norm": 1.4941842555999756, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8840956687927246, + "num_tokens": 468974827.0, + "step": 12872 + }, + { + "epoch": 2.3905292479108633, + "grad_norm": 1.518995761871338, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8795880079269409, + "num_tokens": 469013429.0, + "step": 12873 + }, + { + "epoch": 2.390714948932219, + "grad_norm": 1.660091519355774, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8903117179870605, + "num_tokens": 469046145.0, + "step": 12874 + }, + { + "epoch": 2.3909006499535748, + "grad_norm": 1.7302943468093872, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8869330883026123, + "num_tokens": 469080720.0, + "step": 12875 + }, + { + "epoch": 2.3910863509749305, + "grad_norm": 1.6244186162948608, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8821638822555542, + "num_tokens": 469118620.0, + "step": 12876 + }, + { + "epoch": 2.391272051996286, + "grad_norm": 1.7184481620788574, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8836318254470825, + "num_tokens": 469154512.0, + "step": 12877 + }, + { + "epoch": 2.3914577530176415, + "grad_norm": 1.6220462322235107, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8907196521759033, + "num_tokens": 469192889.0, + "step": 12878 + }, + { + "epoch": 2.3916434540389973, + "grad_norm": 1.5382757186889648, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8932932615280151, + "num_tokens": 469233483.0, + "step": 12879 + }, + { + "epoch": 2.391829155060353, + "grad_norm": 1.6567797660827637, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8926506042480469, + "num_tokens": 469267171.0, + "step": 12880 + }, + { + "epoch": 2.3920148560817083, + "grad_norm": 1.7059050798416138, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8871995210647583, + "num_tokens": 469300243.0, + "step": 12881 + }, + { + "epoch": 2.392200557103064, + "grad_norm": 1.683242678642273, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8729782104492188, + "num_tokens": 469334647.0, + "step": 12882 + }, + { + "epoch": 2.3923862581244197, + "grad_norm": 1.5955761671066284, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8715872764587402, + "num_tokens": 469373335.0, + "step": 12883 + }, + { + "epoch": 2.3925719591457755, + "grad_norm": 1.599552035331726, + "learning_rate": 1e-06, + "loss": 0.2694, + "mean_token_accuracy": 0.9009929895401001, + "num_tokens": 469407621.0, + "step": 12884 + }, + { + "epoch": 2.3927576601671308, + "grad_norm": 1.8550446033477783, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8817599415779114, + "num_tokens": 469439555.0, + "step": 12885 + }, + { + "epoch": 2.3929433611884865, + "grad_norm": 1.5765107870101929, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8851606249809265, + "num_tokens": 469476180.0, + "step": 12886 + }, + { + "epoch": 2.3931290622098422, + "grad_norm": 1.6051909923553467, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.8985199928283691, + "num_tokens": 469511969.0, + "step": 12887 + }, + { + "epoch": 2.393314763231198, + "grad_norm": 1.532678484916687, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.862505316734314, + "num_tokens": 469557038.0, + "step": 12888 + }, + { + "epoch": 2.3935004642525533, + "grad_norm": 1.5506540536880493, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8863471746444702, + "num_tokens": 469594569.0, + "step": 12889 + }, + { + "epoch": 2.393686165273909, + "grad_norm": 1.6988095045089722, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8821209669113159, + "num_tokens": 469631740.0, + "step": 12890 + }, + { + "epoch": 2.3938718662952647, + "grad_norm": 1.5764096975326538, + "learning_rate": 1e-06, + "loss": 0.2762, + "mean_token_accuracy": 0.901559591293335, + "num_tokens": 469668408.0, + "step": 12891 + }, + { + "epoch": 2.3940575673166204, + "grad_norm": 1.5313191413879395, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8957371711730957, + "num_tokens": 469703975.0, + "step": 12892 + }, + { + "epoch": 2.3942432683379757, + "grad_norm": 1.6021791696548462, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8865953087806702, + "num_tokens": 469741601.0, + "step": 12893 + }, + { + "epoch": 2.3944289693593315, + "grad_norm": 1.565271019935608, + "learning_rate": 1e-06, + "loss": 0.2839, + "mean_token_accuracy": 0.8977682590484619, + "num_tokens": 469776183.0, + "step": 12894 + }, + { + "epoch": 2.394614670380687, + "grad_norm": 1.6086400747299194, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.887107789516449, + "num_tokens": 469811892.0, + "step": 12895 + }, + { + "epoch": 2.3948003714020425, + "grad_norm": 1.604891300201416, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8884355425834656, + "num_tokens": 469850545.0, + "step": 12896 + }, + { + "epoch": 2.3949860724233982, + "grad_norm": 1.5455690622329712, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8908547163009644, + "num_tokens": 469888713.0, + "step": 12897 + }, + { + "epoch": 2.395171773444754, + "grad_norm": 1.713318943977356, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8730897903442383, + "num_tokens": 469921893.0, + "step": 12898 + }, + { + "epoch": 2.3953574744661097, + "grad_norm": 1.591936469078064, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8890911340713501, + "num_tokens": 469958094.0, + "step": 12899 + }, + { + "epoch": 2.3955431754874654, + "grad_norm": 1.6111968755722046, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.882297158241272, + "num_tokens": 469996255.0, + "step": 12900 + }, + { + "epoch": 2.3957288765088207, + "grad_norm": 1.8310872316360474, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8935337662696838, + "num_tokens": 470026764.0, + "step": 12901 + }, + { + "epoch": 2.3959145775301764, + "grad_norm": 1.69602370262146, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8830165863037109, + "num_tokens": 470062505.0, + "step": 12902 + }, + { + "epoch": 2.396100278551532, + "grad_norm": 1.586777687072754, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.878031849861145, + "num_tokens": 470101086.0, + "step": 12903 + }, + { + "epoch": 2.3962859795728875, + "grad_norm": 1.6225814819335938, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8738971948623657, + "num_tokens": 470140619.0, + "step": 12904 + }, + { + "epoch": 2.396471680594243, + "grad_norm": 1.647192120552063, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8886553049087524, + "num_tokens": 470179438.0, + "step": 12905 + }, + { + "epoch": 2.396657381615599, + "grad_norm": 1.508927583694458, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8865243792533875, + "num_tokens": 470221518.0, + "step": 12906 + }, + { + "epoch": 2.3968430826369547, + "grad_norm": 1.6015706062316895, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.892125129699707, + "num_tokens": 470256359.0, + "step": 12907 + }, + { + "epoch": 2.39702878365831, + "grad_norm": 1.601323127746582, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8812071084976196, + "num_tokens": 470296274.0, + "step": 12908 + }, + { + "epoch": 2.3972144846796657, + "grad_norm": 1.7105472087860107, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8718618750572205, + "num_tokens": 470335075.0, + "step": 12909 + }, + { + "epoch": 2.3974001857010214, + "grad_norm": 1.9252713918685913, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.876273512840271, + "num_tokens": 470368779.0, + "step": 12910 + }, + { + "epoch": 2.397585886722377, + "grad_norm": 1.5570201873779297, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8902846574783325, + "num_tokens": 470405415.0, + "step": 12911 + }, + { + "epoch": 2.3977715877437324, + "grad_norm": 1.5805490016937256, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.881360650062561, + "num_tokens": 470444907.0, + "step": 12912 + }, + { + "epoch": 2.397957288765088, + "grad_norm": 1.668065071105957, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8811578154563904, + "num_tokens": 470479621.0, + "step": 12913 + }, + { + "epoch": 2.398142989786444, + "grad_norm": 1.614791750907898, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.881271243095398, + "num_tokens": 470519751.0, + "step": 12914 + }, + { + "epoch": 2.3983286908077996, + "grad_norm": 1.5111045837402344, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8840894103050232, + "num_tokens": 470559963.0, + "step": 12915 + }, + { + "epoch": 2.398514391829155, + "grad_norm": 1.7795392274856567, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8770887851715088, + "num_tokens": 470591908.0, + "step": 12916 + }, + { + "epoch": 2.3987000928505107, + "grad_norm": 1.7606827020645142, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8832333087921143, + "num_tokens": 470624499.0, + "step": 12917 + }, + { + "epoch": 2.3988857938718664, + "grad_norm": 1.6268624067306519, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.890185534954071, + "num_tokens": 470659708.0, + "step": 12918 + }, + { + "epoch": 2.3990714948932217, + "grad_norm": 1.7789431810379028, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8768439292907715, + "num_tokens": 470691189.0, + "step": 12919 + }, + { + "epoch": 2.3992571959145774, + "grad_norm": 1.5334579944610596, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8955444097518921, + "num_tokens": 470729238.0, + "step": 12920 + }, + { + "epoch": 2.399442896935933, + "grad_norm": 1.4978622198104858, + "learning_rate": 1e-06, + "loss": 0.2627, + "mean_token_accuracy": 0.9043874740600586, + "num_tokens": 470766634.0, + "step": 12921 + }, + { + "epoch": 2.399628597957289, + "grad_norm": 1.5770155191421509, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8853858709335327, + "num_tokens": 470805882.0, + "step": 12922 + }, + { + "epoch": 2.3998142989786446, + "grad_norm": 1.3739393949508667, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8940216302871704, + "num_tokens": 470851309.0, + "step": 12923 + }, + { + "epoch": 2.4, + "grad_norm": 1.6226917505264282, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8920103311538696, + "num_tokens": 470886122.0, + "step": 12924 + }, + { + "epoch": 2.4001857010213556, + "grad_norm": 1.5078320503234863, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.867060661315918, + "num_tokens": 470934848.0, + "step": 12925 + }, + { + "epoch": 2.4003714020427114, + "grad_norm": 1.5783469676971436, + "learning_rate": 1e-06, + "loss": 0.281, + "mean_token_accuracy": 0.8976171612739563, + "num_tokens": 470969327.0, + "step": 12926 + }, + { + "epoch": 2.4005571030640667, + "grad_norm": 1.5987303256988525, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8792625665664673, + "num_tokens": 471007125.0, + "step": 12927 + }, + { + "epoch": 2.4007428040854224, + "grad_norm": 1.6064023971557617, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.891203761100769, + "num_tokens": 471041984.0, + "step": 12928 + }, + { + "epoch": 2.400928505106778, + "grad_norm": 1.5933539867401123, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8814800977706909, + "num_tokens": 471079220.0, + "step": 12929 + }, + { + "epoch": 2.401114206128134, + "grad_norm": 1.6217790842056274, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8826775550842285, + "num_tokens": 471119455.0, + "step": 12930 + }, + { + "epoch": 2.401299907149489, + "grad_norm": 1.6924716234207153, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.8999831676483154, + "num_tokens": 471150736.0, + "step": 12931 + }, + { + "epoch": 2.401485608170845, + "grad_norm": 1.6264983415603638, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8795208930969238, + "num_tokens": 471185860.0, + "step": 12932 + }, + { + "epoch": 2.4016713091922006, + "grad_norm": 1.693184733390808, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8735800981521606, + "num_tokens": 471222899.0, + "step": 12933 + }, + { + "epoch": 2.4018570102135564, + "grad_norm": 1.6724892854690552, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8959301710128784, + "num_tokens": 471256863.0, + "step": 12934 + }, + { + "epoch": 2.4020427112349116, + "grad_norm": 1.63113534450531, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8962118029594421, + "num_tokens": 471291336.0, + "step": 12935 + }, + { + "epoch": 2.4022284122562674, + "grad_norm": 1.6191720962524414, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8868289589881897, + "num_tokens": 471328254.0, + "step": 12936 + }, + { + "epoch": 2.402414113277623, + "grad_norm": 1.8682260513305664, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8823363780975342, + "num_tokens": 471359223.0, + "step": 12937 + }, + { + "epoch": 2.402599814298979, + "grad_norm": 1.7789405584335327, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8843857049942017, + "num_tokens": 471391510.0, + "step": 12938 + }, + { + "epoch": 2.402785515320334, + "grad_norm": 1.6192127466201782, + "learning_rate": 1e-06, + "loss": 0.2769, + "mean_token_accuracy": 0.898422122001648, + "num_tokens": 471426336.0, + "step": 12939 + }, + { + "epoch": 2.40297121634169, + "grad_norm": 1.533494472503662, + "learning_rate": 1e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8939954042434692, + "num_tokens": 471465353.0, + "step": 12940 + }, + { + "epoch": 2.4031569173630456, + "grad_norm": 1.6456985473632812, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8729914426803589, + "num_tokens": 471505126.0, + "step": 12941 + }, + { + "epoch": 2.403342618384401, + "grad_norm": 1.5779387950897217, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8777045011520386, + "num_tokens": 471543590.0, + "step": 12942 + }, + { + "epoch": 2.4035283194057566, + "grad_norm": 1.7328369617462158, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8844858407974243, + "num_tokens": 471576665.0, + "step": 12943 + }, + { + "epoch": 2.4037140204271124, + "grad_norm": 1.557727336883545, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8829834461212158, + "num_tokens": 471613893.0, + "step": 12944 + }, + { + "epoch": 2.403899721448468, + "grad_norm": 1.6655491590499878, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.8962978720664978, + "num_tokens": 471645146.0, + "step": 12945 + }, + { + "epoch": 2.404085422469824, + "grad_norm": 1.536604881286621, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8879751563072205, + "num_tokens": 471688497.0, + "step": 12946 + }, + { + "epoch": 2.404271123491179, + "grad_norm": 1.6476597785949707, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8917170763015747, + "num_tokens": 471720382.0, + "step": 12947 + }, + { + "epoch": 2.404456824512535, + "grad_norm": 1.5637474060058594, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8925437927246094, + "num_tokens": 471757861.0, + "step": 12948 + }, + { + "epoch": 2.4046425255338906, + "grad_norm": 1.6279850006103516, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.877971887588501, + "num_tokens": 471794714.0, + "step": 12949 + }, + { + "epoch": 2.404828226555246, + "grad_norm": 1.738781213760376, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8849619626998901, + "num_tokens": 471827326.0, + "step": 12950 + }, + { + "epoch": 2.4050139275766016, + "grad_norm": 1.6171197891235352, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8859496116638184, + "num_tokens": 471867896.0, + "step": 12951 + }, + { + "epoch": 2.4051996285979573, + "grad_norm": 1.6020509004592896, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8775753974914551, + "num_tokens": 471907390.0, + "step": 12952 + }, + { + "epoch": 2.405385329619313, + "grad_norm": 1.542351245880127, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8958777785301208, + "num_tokens": 471944159.0, + "step": 12953 + }, + { + "epoch": 2.4055710306406684, + "grad_norm": 1.5740749835968018, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8872661590576172, + "num_tokens": 471981242.0, + "step": 12954 + }, + { + "epoch": 2.405756731662024, + "grad_norm": 1.4780683517456055, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8941477537155151, + "num_tokens": 472019004.0, + "step": 12955 + }, + { + "epoch": 2.40594243268338, + "grad_norm": 1.4818087816238403, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.8976799249649048, + "num_tokens": 472061526.0, + "step": 12956 + }, + { + "epoch": 2.4061281337047355, + "grad_norm": 1.485265851020813, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8925800323486328, + "num_tokens": 472101596.0, + "step": 12957 + }, + { + "epoch": 2.406313834726091, + "grad_norm": 1.6387168169021606, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8871561288833618, + "num_tokens": 472137758.0, + "step": 12958 + }, + { + "epoch": 2.4064995357474466, + "grad_norm": 1.6926114559173584, + "learning_rate": 1e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.898912250995636, + "num_tokens": 472172940.0, + "step": 12959 + }, + { + "epoch": 2.4066852367688023, + "grad_norm": 1.603981852531433, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8956731557846069, + "num_tokens": 472209785.0, + "step": 12960 + }, + { + "epoch": 2.406870937790158, + "grad_norm": 1.7012919187545776, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.88458651304245, + "num_tokens": 472245546.0, + "step": 12961 + }, + { + "epoch": 2.4070566388115133, + "grad_norm": 1.5181642770767212, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8869485855102539, + "num_tokens": 472283814.0, + "step": 12962 + }, + { + "epoch": 2.407242339832869, + "grad_norm": 1.6130629777908325, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.887316107749939, + "num_tokens": 472318634.0, + "step": 12963 + }, + { + "epoch": 2.407428040854225, + "grad_norm": 1.6740291118621826, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8832626938819885, + "num_tokens": 472353783.0, + "step": 12964 + }, + { + "epoch": 2.40761374187558, + "grad_norm": 1.6590172052383423, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8819104433059692, + "num_tokens": 472388282.0, + "step": 12965 + }, + { + "epoch": 2.407799442896936, + "grad_norm": 1.7307332754135132, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8647080063819885, + "num_tokens": 472426873.0, + "step": 12966 + }, + { + "epoch": 2.4079851439182915, + "grad_norm": 1.583336591720581, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8876512050628662, + "num_tokens": 472464664.0, + "step": 12967 + }, + { + "epoch": 2.4081708449396473, + "grad_norm": 1.732569694519043, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8735815286636353, + "num_tokens": 472499094.0, + "step": 12968 + }, + { + "epoch": 2.408356545961003, + "grad_norm": 1.5561760663986206, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8884068131446838, + "num_tokens": 472539196.0, + "step": 12969 + }, + { + "epoch": 2.4085422469823583, + "grad_norm": 1.5519511699676514, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8809309601783752, + "num_tokens": 472578714.0, + "step": 12970 + }, + { + "epoch": 2.408727948003714, + "grad_norm": 1.589656114578247, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8879587054252625, + "num_tokens": 472612606.0, + "step": 12971 + }, + { + "epoch": 2.4089136490250698, + "grad_norm": 1.573351263999939, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8825448751449585, + "num_tokens": 472649844.0, + "step": 12972 + }, + { + "epoch": 2.409099350046425, + "grad_norm": 1.6297187805175781, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8919119834899902, + "num_tokens": 472682073.0, + "step": 12973 + }, + { + "epoch": 2.409285051067781, + "grad_norm": 1.4729304313659668, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.884820818901062, + "num_tokens": 472721750.0, + "step": 12974 + }, + { + "epoch": 2.4094707520891365, + "grad_norm": 1.8135274648666382, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8760114908218384, + "num_tokens": 472752640.0, + "step": 12975 + }, + { + "epoch": 2.4096564531104923, + "grad_norm": 1.7225494384765625, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8883872628211975, + "num_tokens": 472786560.0, + "step": 12976 + }, + { + "epoch": 2.4098421541318475, + "grad_norm": 1.5834691524505615, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8884092569351196, + "num_tokens": 472826632.0, + "step": 12977 + }, + { + "epoch": 2.4100278551532033, + "grad_norm": 1.8173243999481201, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8824402689933777, + "num_tokens": 472855680.0, + "step": 12978 + }, + { + "epoch": 2.410213556174559, + "grad_norm": 1.582853078842163, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8936396241188049, + "num_tokens": 472889006.0, + "step": 12979 + }, + { + "epoch": 2.4103992571959147, + "grad_norm": 1.6643235683441162, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8556978106498718, + "num_tokens": 472924897.0, + "step": 12980 + }, + { + "epoch": 2.41058495821727, + "grad_norm": 1.5382499694824219, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8836438655853271, + "num_tokens": 472967311.0, + "step": 12981 + }, + { + "epoch": 2.4107706592386258, + "grad_norm": 1.6639950275421143, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8887605667114258, + "num_tokens": 472999917.0, + "step": 12982 + }, + { + "epoch": 2.4109563602599815, + "grad_norm": 1.6042027473449707, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8945526480674744, + "num_tokens": 473037135.0, + "step": 12983 + }, + { + "epoch": 2.4111420612813372, + "grad_norm": 1.7424025535583496, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8804929256439209, + "num_tokens": 473072888.0, + "step": 12984 + }, + { + "epoch": 2.4113277623026925, + "grad_norm": 1.6556594371795654, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8742351531982422, + "num_tokens": 473111651.0, + "step": 12985 + }, + { + "epoch": 2.4115134633240483, + "grad_norm": 1.7537429332733154, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8924959301948547, + "num_tokens": 473139971.0, + "step": 12986 + }, + { + "epoch": 2.411699164345404, + "grad_norm": 1.7278720140457153, + "learning_rate": 1e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.8954975605010986, + "num_tokens": 473170046.0, + "step": 12987 + }, + { + "epoch": 2.4118848653667593, + "grad_norm": 1.627040147781372, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8913902044296265, + "num_tokens": 473209621.0, + "step": 12988 + }, + { + "epoch": 2.412070566388115, + "grad_norm": 1.8084332942962646, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.894659161567688, + "num_tokens": 473240524.0, + "step": 12989 + }, + { + "epoch": 2.4122562674094707, + "grad_norm": 1.7212351560592651, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8881303071975708, + "num_tokens": 473274414.0, + "step": 12990 + }, + { + "epoch": 2.4124419684308265, + "grad_norm": 1.652773141860962, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8830139636993408, + "num_tokens": 473310759.0, + "step": 12991 + }, + { + "epoch": 2.412627669452182, + "grad_norm": 1.4832048416137695, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8896345496177673, + "num_tokens": 473352233.0, + "step": 12992 + }, + { + "epoch": 2.4128133704735375, + "grad_norm": 1.6782445907592773, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8880578875541687, + "num_tokens": 473387329.0, + "step": 12993 + }, + { + "epoch": 2.4129990714948932, + "grad_norm": 1.5425797700881958, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.8979897499084473, + "num_tokens": 473424269.0, + "step": 12994 + }, + { + "epoch": 2.413184772516249, + "grad_norm": 1.6287319660186768, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8900882601737976, + "num_tokens": 473459697.0, + "step": 12995 + }, + { + "epoch": 2.4133704735376043, + "grad_norm": 1.539808750152588, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8739652037620544, + "num_tokens": 473502044.0, + "step": 12996 + }, + { + "epoch": 2.41355617455896, + "grad_norm": 1.7735494375228882, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8922634124755859, + "num_tokens": 473533564.0, + "step": 12997 + }, + { + "epoch": 2.4137418755803157, + "grad_norm": 1.4789226055145264, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8949702382087708, + "num_tokens": 473570646.0, + "step": 12998 + }, + { + "epoch": 2.4139275766016715, + "grad_norm": 1.7206811904907227, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8912927508354187, + "num_tokens": 473600644.0, + "step": 12999 + }, + { + "epoch": 2.4141132776230267, + "grad_norm": 1.5772002935409546, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8945146799087524, + "num_tokens": 473636343.0, + "step": 13000 + }, + { + "epoch": 2.4142989786443825, + "grad_norm": 1.6415032148361206, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8909155130386353, + "num_tokens": 473674304.0, + "step": 13001 + }, + { + "epoch": 2.414484679665738, + "grad_norm": 1.5831352472305298, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8848126530647278, + "num_tokens": 473712561.0, + "step": 13002 + }, + { + "epoch": 2.414670380687094, + "grad_norm": 1.607367753982544, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8880588412284851, + "num_tokens": 473750176.0, + "step": 13003 + }, + { + "epoch": 2.4148560817084492, + "grad_norm": 1.4932141304016113, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8973363041877747, + "num_tokens": 473790292.0, + "step": 13004 + }, + { + "epoch": 2.415041782729805, + "grad_norm": 1.6009328365325928, + "learning_rate": 1e-06, + "loss": 0.2626, + "mean_token_accuracy": 0.9029865860939026, + "num_tokens": 473822361.0, + "step": 13005 + }, + { + "epoch": 2.4152274837511607, + "grad_norm": 1.524688720703125, + "learning_rate": 1e-06, + "loss": 0.2787, + "mean_token_accuracy": 0.8981629610061646, + "num_tokens": 473860597.0, + "step": 13006 + }, + { + "epoch": 2.4154131847725164, + "grad_norm": 1.5943042039871216, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.891521155834198, + "num_tokens": 473898335.0, + "step": 13007 + }, + { + "epoch": 2.4155988857938717, + "grad_norm": 1.719813585281372, + "learning_rate": 1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.8977531790733337, + "num_tokens": 473933400.0, + "step": 13008 + }, + { + "epoch": 2.4157845868152275, + "grad_norm": 1.6708109378814697, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8931803703308105, + "num_tokens": 473968410.0, + "step": 13009 + }, + { + "epoch": 2.415970287836583, + "grad_norm": 1.7860198020935059, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8901068568229675, + "num_tokens": 474000619.0, + "step": 13010 + }, + { + "epoch": 2.4161559888579385, + "grad_norm": 1.7367082834243774, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.878122091293335, + "num_tokens": 474033901.0, + "step": 13011 + }, + { + "epoch": 2.416341689879294, + "grad_norm": 1.6294145584106445, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8979061841964722, + "num_tokens": 474073240.0, + "step": 13012 + }, + { + "epoch": 2.41652739090065, + "grad_norm": 1.7231067419052124, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.884655237197876, + "num_tokens": 474107592.0, + "step": 13013 + }, + { + "epoch": 2.4167130919220057, + "grad_norm": 1.4881956577301025, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8855736255645752, + "num_tokens": 474147804.0, + "step": 13014 + }, + { + "epoch": 2.4168987929433614, + "grad_norm": 1.7003629207611084, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8818610906600952, + "num_tokens": 474185750.0, + "step": 13015 + }, + { + "epoch": 2.4170844939647167, + "grad_norm": 1.5420259237289429, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8924007415771484, + "num_tokens": 474222181.0, + "step": 13016 + }, + { + "epoch": 2.4172701949860724, + "grad_norm": 1.5737723112106323, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.890656590461731, + "num_tokens": 474259221.0, + "step": 13017 + }, + { + "epoch": 2.417455896007428, + "grad_norm": 1.58128023147583, + "learning_rate": 1e-06, + "loss": 0.2787, + "mean_token_accuracy": 0.898429811000824, + "num_tokens": 474292598.0, + "step": 13018 + }, + { + "epoch": 2.4176415970287835, + "grad_norm": 1.5107487440109253, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8927186131477356, + "num_tokens": 474330191.0, + "step": 13019 + }, + { + "epoch": 2.417827298050139, + "grad_norm": 1.504958987236023, + "learning_rate": 1e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.8987045288085938, + "num_tokens": 474368806.0, + "step": 13020 + }, + { + "epoch": 2.418012999071495, + "grad_norm": 1.5547550916671753, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8882256746292114, + "num_tokens": 474406401.0, + "step": 13021 + }, + { + "epoch": 2.4181987000928507, + "grad_norm": 1.569518804550171, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8943952322006226, + "num_tokens": 474442937.0, + "step": 13022 + }, + { + "epoch": 2.418384401114206, + "grad_norm": 1.6541012525558472, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8837270736694336, + "num_tokens": 474478066.0, + "step": 13023 + }, + { + "epoch": 2.4185701021355617, + "grad_norm": 1.6058930158615112, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.890615701675415, + "num_tokens": 474514544.0, + "step": 13024 + }, + { + "epoch": 2.4187558031569174, + "grad_norm": 1.4675827026367188, + "learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.8995956182479858, + "num_tokens": 474552559.0, + "step": 13025 + }, + { + "epoch": 2.418941504178273, + "grad_norm": 1.6039631366729736, + "learning_rate": 1e-06, + "loss": 0.2707, + "mean_token_accuracy": 0.8991792798042297, + "num_tokens": 474586733.0, + "step": 13026 + }, + { + "epoch": 2.4191272051996284, + "grad_norm": 1.6600812673568726, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8878746032714844, + "num_tokens": 474623535.0, + "step": 13027 + }, + { + "epoch": 2.419312906220984, + "grad_norm": 1.624471664428711, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8920549154281616, + "num_tokens": 474661736.0, + "step": 13028 + }, + { + "epoch": 2.41949860724234, + "grad_norm": 1.6242060661315918, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8874408006668091, + "num_tokens": 474697257.0, + "step": 13029 + }, + { + "epoch": 2.4196843082636956, + "grad_norm": 1.6146132946014404, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.884354293346405, + "num_tokens": 474734161.0, + "step": 13030 + }, + { + "epoch": 2.419870009285051, + "grad_norm": 1.4507991075515747, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8957851529121399, + "num_tokens": 474775792.0, + "step": 13031 + }, + { + "epoch": 2.4200557103064066, + "grad_norm": 1.6300692558288574, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8798061609268188, + "num_tokens": 474812432.0, + "step": 13032 + }, + { + "epoch": 2.4202414113277624, + "grad_norm": 1.6229037046432495, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.8926091194152832, + "num_tokens": 474845268.0, + "step": 13033 + }, + { + "epoch": 2.4204271123491177, + "grad_norm": 1.5941588878631592, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8844074010848999, + "num_tokens": 474884608.0, + "step": 13034 + }, + { + "epoch": 2.4206128133704734, + "grad_norm": 1.6899234056472778, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8853614926338196, + "num_tokens": 474918346.0, + "step": 13035 + }, + { + "epoch": 2.420798514391829, + "grad_norm": 1.644088625907898, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8856330513954163, + "num_tokens": 474954725.0, + "step": 13036 + }, + { + "epoch": 2.420984215413185, + "grad_norm": 1.7020121812820435, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8739507794380188, + "num_tokens": 474992497.0, + "step": 13037 + }, + { + "epoch": 2.4211699164345406, + "grad_norm": 1.5055338144302368, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8958606719970703, + "num_tokens": 475031128.0, + "step": 13038 + }, + { + "epoch": 2.421355617455896, + "grad_norm": 1.5920339822769165, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.89753657579422, + "num_tokens": 475067632.0, + "step": 13039 + }, + { + "epoch": 2.4215413184772516, + "grad_norm": 1.9270403385162354, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8785861730575562, + "num_tokens": 475097444.0, + "step": 13040 + }, + { + "epoch": 2.4217270194986074, + "grad_norm": 1.6489672660827637, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8941662311553955, + "num_tokens": 475132890.0, + "step": 13041 + }, + { + "epoch": 2.4219127205199626, + "grad_norm": 1.558506727218628, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8961796760559082, + "num_tokens": 475170037.0, + "step": 13042 + }, + { + "epoch": 2.4220984215413184, + "grad_norm": 1.5443195104599, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8836789727210999, + "num_tokens": 475209255.0, + "step": 13043 + }, + { + "epoch": 2.422284122562674, + "grad_norm": 1.7093122005462646, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8688187003135681, + "num_tokens": 475245870.0, + "step": 13044 + }, + { + "epoch": 2.42246982358403, + "grad_norm": 1.6133211851119995, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8754327893257141, + "num_tokens": 475284887.0, + "step": 13045 + }, + { + "epoch": 2.422655524605385, + "grad_norm": 1.7241848707199097, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8904412388801575, + "num_tokens": 475316355.0, + "step": 13046 + }, + { + "epoch": 2.422841225626741, + "grad_norm": 1.412572979927063, + "learning_rate": 1e-06, + "loss": 0.2766, + "mean_token_accuracy": 0.9000227451324463, + "num_tokens": 475357058.0, + "step": 13047 + }, + { + "epoch": 2.4230269266480966, + "grad_norm": 1.5202932357788086, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8935927152633667, + "num_tokens": 475394312.0, + "step": 13048 + }, + { + "epoch": 2.4232126276694523, + "grad_norm": 1.6961416006088257, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8768200278282166, + "num_tokens": 475429158.0, + "step": 13049 + }, + { + "epoch": 2.4233983286908076, + "grad_norm": 1.564751386642456, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8929954767227173, + "num_tokens": 475462906.0, + "step": 13050 + }, + { + "epoch": 2.4235840297121634, + "grad_norm": 1.6892789602279663, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8765074014663696, + "num_tokens": 475499025.0, + "step": 13051 + }, + { + "epoch": 2.423769730733519, + "grad_norm": 1.5712403059005737, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8840846419334412, + "num_tokens": 475537073.0, + "step": 13052 + }, + { + "epoch": 2.423955431754875, + "grad_norm": 1.6544967889785767, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.882012128829956, + "num_tokens": 475573420.0, + "step": 13053 + }, + { + "epoch": 2.42414113277623, + "grad_norm": 1.6138304471969604, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8889851570129395, + "num_tokens": 475608842.0, + "step": 13054 + }, + { + "epoch": 2.424326833797586, + "grad_norm": 1.4805653095245361, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8927288055419922, + "num_tokens": 475649295.0, + "step": 13055 + }, + { + "epoch": 2.4245125348189416, + "grad_norm": 1.602154016494751, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8904857635498047, + "num_tokens": 475680017.0, + "step": 13056 + }, + { + "epoch": 2.4246982358402973, + "grad_norm": 1.6495991945266724, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8846353888511658, + "num_tokens": 475711793.0, + "step": 13057 + }, + { + "epoch": 2.4248839368616526, + "grad_norm": 1.4409269094467163, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8765003085136414, + "num_tokens": 475754219.0, + "step": 13058 + }, + { + "epoch": 2.4250696378830083, + "grad_norm": 1.541412115097046, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8851781487464905, + "num_tokens": 475794632.0, + "step": 13059 + }, + { + "epoch": 2.425255338904364, + "grad_norm": 1.6073349714279175, + "learning_rate": 1e-06, + "loss": 0.2781, + "mean_token_accuracy": 0.8968531489372253, + "num_tokens": 475829817.0, + "step": 13060 + }, + { + "epoch": 2.42544103992572, + "grad_norm": 1.8518295288085938, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8909846544265747, + "num_tokens": 475862903.0, + "step": 13061 + }, + { + "epoch": 2.425626740947075, + "grad_norm": 1.5731382369995117, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8821353912353516, + "num_tokens": 475898591.0, + "step": 13062 + }, + { + "epoch": 2.425812441968431, + "grad_norm": 1.5488784313201904, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8878141641616821, + "num_tokens": 475938289.0, + "step": 13063 + }, + { + "epoch": 2.4259981429897866, + "grad_norm": 1.648992896080017, + "learning_rate": 1e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.8954501152038574, + "num_tokens": 475974740.0, + "step": 13064 + }, + { + "epoch": 2.426183844011142, + "grad_norm": 1.5715746879577637, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8873350620269775, + "num_tokens": 476013509.0, + "step": 13065 + }, + { + "epoch": 2.4263695450324976, + "grad_norm": 1.6418514251708984, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.891419529914856, + "num_tokens": 476049945.0, + "step": 13066 + }, + { + "epoch": 2.4265552460538533, + "grad_norm": 1.569928765296936, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8867780566215515, + "num_tokens": 476089628.0, + "step": 13067 + }, + { + "epoch": 2.426740947075209, + "grad_norm": 1.6939489841461182, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8827493190765381, + "num_tokens": 476121876.0, + "step": 13068 + }, + { + "epoch": 2.4269266480965648, + "grad_norm": 1.4869740009307861, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8754413723945618, + "num_tokens": 476167998.0, + "step": 13069 + }, + { + "epoch": 2.42711234911792, + "grad_norm": 1.6067285537719727, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8855857849121094, + "num_tokens": 476202555.0, + "step": 13070 + }, + { + "epoch": 2.427298050139276, + "grad_norm": 1.6027960777282715, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8728512525558472, + "num_tokens": 476239856.0, + "step": 13071 + }, + { + "epoch": 2.4274837511606315, + "grad_norm": 1.653781533241272, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8876492381095886, + "num_tokens": 476279071.0, + "step": 13072 + }, + { + "epoch": 2.427669452181987, + "grad_norm": 1.5391536951065063, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8893778324127197, + "num_tokens": 476320413.0, + "step": 13073 + }, + { + "epoch": 2.4278551532033426, + "grad_norm": 1.5607078075408936, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.881973147392273, + "num_tokens": 476358513.0, + "step": 13074 + }, + { + "epoch": 2.4280408542246983, + "grad_norm": 1.5354845523834229, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.875812828540802, + "num_tokens": 476400479.0, + "step": 13075 + }, + { + "epoch": 2.428226555246054, + "grad_norm": 1.5840034484863281, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.893758237361908, + "num_tokens": 476437063.0, + "step": 13076 + }, + { + "epoch": 2.4284122562674093, + "grad_norm": 1.6613517999649048, + "learning_rate": 1e-06, + "loss": 0.275, + "mean_token_accuracy": 0.8981728553771973, + "num_tokens": 476471530.0, + "step": 13077 + }, + { + "epoch": 2.428597957288765, + "grad_norm": 1.6652660369873047, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.882610559463501, + "num_tokens": 476508053.0, + "step": 13078 + }, + { + "epoch": 2.4287836583101208, + "grad_norm": 1.560983657836914, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8941271305084229, + "num_tokens": 476543167.0, + "step": 13079 + }, + { + "epoch": 2.4289693593314765, + "grad_norm": 1.7425395250320435, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8758665919303894, + "num_tokens": 476578304.0, + "step": 13080 + }, + { + "epoch": 2.429155060352832, + "grad_norm": 1.5424550771713257, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.890752375125885, + "num_tokens": 476614613.0, + "step": 13081 + }, + { + "epoch": 2.4293407613741875, + "grad_norm": 1.6724351644515991, + "learning_rate": 1e-06, + "loss": 0.2766, + "mean_token_accuracy": 0.8992950320243835, + "num_tokens": 476643920.0, + "step": 13082 + }, + { + "epoch": 2.4295264623955433, + "grad_norm": 1.5426324605941772, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8814054727554321, + "num_tokens": 476682573.0, + "step": 13083 + }, + { + "epoch": 2.429712163416899, + "grad_norm": 1.6494320631027222, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.879185140132904, + "num_tokens": 476726049.0, + "step": 13084 + }, + { + "epoch": 2.4298978644382543, + "grad_norm": 1.6237807273864746, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8951225876808167, + "num_tokens": 476760191.0, + "step": 13085 + }, + { + "epoch": 2.43008356545961, + "grad_norm": 1.6950384378433228, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.894424557685852, + "num_tokens": 476790908.0, + "step": 13086 + }, + { + "epoch": 2.4302692664809658, + "grad_norm": 1.6188242435455322, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8937049508094788, + "num_tokens": 476824713.0, + "step": 13087 + }, + { + "epoch": 2.430454967502321, + "grad_norm": 1.5955729484558105, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8929522037506104, + "num_tokens": 476860483.0, + "step": 13088 + }, + { + "epoch": 2.4306406685236768, + "grad_norm": 1.6027804613113403, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8949966430664062, + "num_tokens": 476893828.0, + "step": 13089 + }, + { + "epoch": 2.4308263695450325, + "grad_norm": 1.5126689672470093, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8926011323928833, + "num_tokens": 476932660.0, + "step": 13090 + }, + { + "epoch": 2.4310120705663882, + "grad_norm": 1.4520988464355469, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8811583518981934, + "num_tokens": 476978850.0, + "step": 13091 + }, + { + "epoch": 2.431197771587744, + "grad_norm": 1.7275707721710205, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8934273719787598, + "num_tokens": 477010532.0, + "step": 13092 + }, + { + "epoch": 2.4313834726090993, + "grad_norm": 1.6500012874603271, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8792603015899658, + "num_tokens": 477048453.0, + "step": 13093 + }, + { + "epoch": 2.431569173630455, + "grad_norm": 1.7243589162826538, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8918625116348267, + "num_tokens": 477081823.0, + "step": 13094 + }, + { + "epoch": 2.4317548746518107, + "grad_norm": 1.5857717990875244, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8817086815834045, + "num_tokens": 477117957.0, + "step": 13095 + }, + { + "epoch": 2.431940575673166, + "grad_norm": 1.5760666131973267, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8870774507522583, + "num_tokens": 477151851.0, + "step": 13096 + }, + { + "epoch": 2.4321262766945217, + "grad_norm": 1.5874930620193481, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8723897933959961, + "num_tokens": 477190848.0, + "step": 13097 + }, + { + "epoch": 2.4323119777158775, + "grad_norm": 1.5493518114089966, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8878587484359741, + "num_tokens": 477230335.0, + "step": 13098 + }, + { + "epoch": 2.432497678737233, + "grad_norm": 1.7457178831100464, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.874575138092041, + "num_tokens": 477264985.0, + "step": 13099 + }, + { + "epoch": 2.4326833797585885, + "grad_norm": 1.5454671382904053, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8914608955383301, + "num_tokens": 477304972.0, + "step": 13100 + }, + { + "epoch": 2.4328690807799442, + "grad_norm": 1.7135173082351685, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8721777200698853, + "num_tokens": 477338880.0, + "step": 13101 + }, + { + "epoch": 2.4330547818013, + "grad_norm": 1.9059728384017944, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8788580298423767, + "num_tokens": 477369551.0, + "step": 13102 + }, + { + "epoch": 2.4332404828226557, + "grad_norm": 1.5639241933822632, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.882956862449646, + "num_tokens": 477408005.0, + "step": 13103 + }, + { + "epoch": 2.433426183844011, + "grad_norm": 1.5424407720565796, + "learning_rate": 1e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.8967412710189819, + "num_tokens": 477448467.0, + "step": 13104 + }, + { + "epoch": 2.4336118848653667, + "grad_norm": 1.6023736000061035, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8926384449005127, + "num_tokens": 477487419.0, + "step": 13105 + }, + { + "epoch": 2.4337975858867225, + "grad_norm": 1.7225348949432373, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8893567323684692, + "num_tokens": 477518657.0, + "step": 13106 + }, + { + "epoch": 2.433983286908078, + "grad_norm": 1.6900558471679688, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8830930590629578, + "num_tokens": 477557398.0, + "step": 13107 + }, + { + "epoch": 2.4341689879294335, + "grad_norm": 1.5347702503204346, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8953179717063904, + "num_tokens": 477595434.0, + "step": 13108 + }, + { + "epoch": 2.434354688950789, + "grad_norm": 1.666857123374939, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8909687399864197, + "num_tokens": 477628510.0, + "step": 13109 + }, + { + "epoch": 2.434540389972145, + "grad_norm": 1.8105924129486084, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.8985892534255981, + "num_tokens": 477656489.0, + "step": 13110 + }, + { + "epoch": 2.4347260909935002, + "grad_norm": 1.6850978136062622, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8726881742477417, + "num_tokens": 477693902.0, + "step": 13111 + }, + { + "epoch": 2.434911792014856, + "grad_norm": 1.8107776641845703, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.887272834777832, + "num_tokens": 477725421.0, + "step": 13112 + }, + { + "epoch": 2.4350974930362117, + "grad_norm": 1.506831169128418, + "learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.8979727625846863, + "num_tokens": 477766151.0, + "step": 13113 + }, + { + "epoch": 2.4352831940575674, + "grad_norm": 1.5853222608566284, + "learning_rate": 1e-06, + "loss": 0.2677, + "mean_token_accuracy": 0.9023674726486206, + "num_tokens": 477798510.0, + "step": 13114 + }, + { + "epoch": 2.435468895078923, + "grad_norm": 1.702264428138733, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8823137879371643, + "num_tokens": 477830443.0, + "step": 13115 + }, + { + "epoch": 2.4356545961002785, + "grad_norm": 1.5770989656448364, + "learning_rate": 1e-06, + "loss": 0.2733, + "mean_token_accuracy": 0.9001099467277527, + "num_tokens": 477867110.0, + "step": 13116 + }, + { + "epoch": 2.435840297121634, + "grad_norm": 1.6651418209075928, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.873488187789917, + "num_tokens": 477900906.0, + "step": 13117 + }, + { + "epoch": 2.43602599814299, + "grad_norm": 1.5984759330749512, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8897314071655273, + "num_tokens": 477940213.0, + "step": 13118 + }, + { + "epoch": 2.436211699164345, + "grad_norm": 1.5013244152069092, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8868434429168701, + "num_tokens": 477977804.0, + "step": 13119 + }, + { + "epoch": 2.436397400185701, + "grad_norm": 1.4980361461639404, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8929908871650696, + "num_tokens": 478013703.0, + "step": 13120 + }, + { + "epoch": 2.4365831012070567, + "grad_norm": 1.5961898565292358, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8874393701553345, + "num_tokens": 478050978.0, + "step": 13121 + }, + { + "epoch": 2.4367688022284124, + "grad_norm": 1.4707629680633545, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8915772438049316, + "num_tokens": 478094935.0, + "step": 13122 + }, + { + "epoch": 2.4369545032497677, + "grad_norm": 1.6448689699172974, + "learning_rate": 1e-06, + "loss": 0.2588, + "mean_token_accuracy": 0.9056544303894043, + "num_tokens": 478127976.0, + "step": 13123 + }, + { + "epoch": 2.4371402042711234, + "grad_norm": 1.5077568292617798, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8839285373687744, + "num_tokens": 478166541.0, + "step": 13124 + }, + { + "epoch": 2.437325905292479, + "grad_norm": 1.738642692565918, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.874265193939209, + "num_tokens": 478200920.0, + "step": 13125 + }, + { + "epoch": 2.437511606313835, + "grad_norm": 1.6436058282852173, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8777353167533875, + "num_tokens": 478239137.0, + "step": 13126 + }, + { + "epoch": 2.43769730733519, + "grad_norm": 1.8107296228408813, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8719171285629272, + "num_tokens": 478273218.0, + "step": 13127 + }, + { + "epoch": 2.437883008356546, + "grad_norm": 1.5809134244918823, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8877396583557129, + "num_tokens": 478309051.0, + "step": 13128 + }, + { + "epoch": 2.4380687093779017, + "grad_norm": 1.7203211784362793, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.89197838306427, + "num_tokens": 478342094.0, + "step": 13129 + }, + { + "epoch": 2.4382544103992574, + "grad_norm": 1.662202000617981, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8855663537979126, + "num_tokens": 478378546.0, + "step": 13130 + }, + { + "epoch": 2.4384401114206127, + "grad_norm": 1.562373399734497, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8799094557762146, + "num_tokens": 478417456.0, + "step": 13131 + }, + { + "epoch": 2.4386258124419684, + "grad_norm": 1.8650912046432495, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.881367564201355, + "num_tokens": 478448317.0, + "step": 13132 + }, + { + "epoch": 2.438811513463324, + "grad_norm": 1.661091923713684, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8845564126968384, + "num_tokens": 478484722.0, + "step": 13133 + }, + { + "epoch": 2.4389972144846794, + "grad_norm": 1.722227692604065, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.865078330039978, + "num_tokens": 478520333.0, + "step": 13134 + }, + { + "epoch": 2.439182915506035, + "grad_norm": 1.5559567213058472, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8771296143531799, + "num_tokens": 478558927.0, + "step": 13135 + }, + { + "epoch": 2.439368616527391, + "grad_norm": 1.6385308504104614, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8951173424720764, + "num_tokens": 478590240.0, + "step": 13136 + }, + { + "epoch": 2.4395543175487466, + "grad_norm": 1.5592900514602661, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.8998463153839111, + "num_tokens": 478626877.0, + "step": 13137 + }, + { + "epoch": 2.4397400185701024, + "grad_norm": 1.5113639831542969, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8883771300315857, + "num_tokens": 478670148.0, + "step": 13138 + }, + { + "epoch": 2.4399257195914577, + "grad_norm": 1.6527553796768188, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8926635980606079, + "num_tokens": 478706550.0, + "step": 13139 + }, + { + "epoch": 2.4401114206128134, + "grad_norm": 1.6271950006484985, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8828076720237732, + "num_tokens": 478742243.0, + "step": 13140 + }, + { + "epoch": 2.440297121634169, + "grad_norm": 1.4847334623336792, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8974012732505798, + "num_tokens": 478782954.0, + "step": 13141 + }, + { + "epoch": 2.4404828226555244, + "grad_norm": 1.5794129371643066, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8934099078178406, + "num_tokens": 478819568.0, + "step": 13142 + }, + { + "epoch": 2.44066852367688, + "grad_norm": 1.8098169565200806, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8830094337463379, + "num_tokens": 478850765.0, + "step": 13143 + }, + { + "epoch": 2.440854224698236, + "grad_norm": 1.6397491693496704, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8764863014221191, + "num_tokens": 478889685.0, + "step": 13144 + }, + { + "epoch": 2.4410399257195916, + "grad_norm": 1.6493074893951416, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8927803039550781, + "num_tokens": 478921622.0, + "step": 13145 + }, + { + "epoch": 2.441225626740947, + "grad_norm": 1.7783602476119995, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8830756545066833, + "num_tokens": 478955202.0, + "step": 13146 + }, + { + "epoch": 2.4414113277623026, + "grad_norm": 1.692859172821045, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8834477663040161, + "num_tokens": 478992295.0, + "step": 13147 + }, + { + "epoch": 2.4415970287836584, + "grad_norm": 1.6478482484817505, + "learning_rate": 1e-06, + "loss": 0.2613, + "mean_token_accuracy": 0.9035013318061829, + "num_tokens": 479032602.0, + "step": 13148 + }, + { + "epoch": 2.441782729805014, + "grad_norm": 1.642961025238037, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8935361504554749, + "num_tokens": 479065622.0, + "step": 13149 + }, + { + "epoch": 2.4419684308263694, + "grad_norm": 1.5857735872268677, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8880311846733093, + "num_tokens": 479102082.0, + "step": 13150 + }, + { + "epoch": 2.442154131847725, + "grad_norm": 1.540744423866272, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8961905837059021, + "num_tokens": 479137854.0, + "step": 13151 + }, + { + "epoch": 2.442339832869081, + "grad_norm": 1.5562453269958496, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8852688670158386, + "num_tokens": 479179639.0, + "step": 13152 + }, + { + "epoch": 2.4425255338904366, + "grad_norm": 1.4915947914123535, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8935133218765259, + "num_tokens": 479218483.0, + "step": 13153 + }, + { + "epoch": 2.442711234911792, + "grad_norm": 1.6139483451843262, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8903161883354187, + "num_tokens": 479253870.0, + "step": 13154 + }, + { + "epoch": 2.4428969359331476, + "grad_norm": 1.7750409841537476, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8715579509735107, + "num_tokens": 479297800.0, + "step": 13155 + }, + { + "epoch": 2.4430826369545033, + "grad_norm": 1.6329621076583862, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8933486938476562, + "num_tokens": 479330802.0, + "step": 13156 + }, + { + "epoch": 2.4432683379758586, + "grad_norm": 1.8694250583648682, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8754003047943115, + "num_tokens": 479365646.0, + "step": 13157 + }, + { + "epoch": 2.4434540389972144, + "grad_norm": 1.807747721672058, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8780490756034851, + "num_tokens": 479399871.0, + "step": 13158 + }, + { + "epoch": 2.44363974001857, + "grad_norm": 1.657062292098999, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8700131177902222, + "num_tokens": 479442065.0, + "step": 13159 + }, + { + "epoch": 2.443825441039926, + "grad_norm": 1.5964696407318115, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8779129981994629, + "num_tokens": 479483166.0, + "step": 13160 + }, + { + "epoch": 2.4440111420612816, + "grad_norm": 1.5891995429992676, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8933931589126587, + "num_tokens": 479522558.0, + "step": 13161 + }, + { + "epoch": 2.444196843082637, + "grad_norm": 1.7154463529586792, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8744504451751709, + "num_tokens": 479563053.0, + "step": 13162 + }, + { + "epoch": 2.4443825441039926, + "grad_norm": 1.5646262168884277, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8838071227073669, + "num_tokens": 479602140.0, + "step": 13163 + }, + { + "epoch": 2.4445682451253483, + "grad_norm": 1.5283643007278442, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8791453838348389, + "num_tokens": 479644019.0, + "step": 13164 + }, + { + "epoch": 2.4447539461467036, + "grad_norm": 1.56059992313385, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8880568742752075, + "num_tokens": 479680193.0, + "step": 13165 + }, + { + "epoch": 2.4449396471680593, + "grad_norm": 1.5807032585144043, + "learning_rate": 1e-06, + "loss": 0.2566, + "mean_token_accuracy": 0.9065039157867432, + "num_tokens": 479712819.0, + "step": 13166 + }, + { + "epoch": 2.445125348189415, + "grad_norm": 1.5306614637374878, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8919174671173096, + "num_tokens": 479752575.0, + "step": 13167 + }, + { + "epoch": 2.445311049210771, + "grad_norm": 1.682246446609497, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8755860328674316, + "num_tokens": 479792948.0, + "step": 13168 + }, + { + "epoch": 2.445496750232126, + "grad_norm": 1.5751855373382568, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8794043064117432, + "num_tokens": 479832147.0, + "step": 13169 + }, + { + "epoch": 2.445682451253482, + "grad_norm": 1.588218331336975, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8792822360992432, + "num_tokens": 479871411.0, + "step": 13170 + }, + { + "epoch": 2.4458681522748376, + "grad_norm": 1.5258829593658447, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8967974185943604, + "num_tokens": 479910512.0, + "step": 13171 + }, + { + "epoch": 2.4460538532961933, + "grad_norm": 1.573472023010254, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8805521726608276, + "num_tokens": 479949261.0, + "step": 13172 + }, + { + "epoch": 2.4462395543175486, + "grad_norm": 1.7138123512268066, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8875815272331238, + "num_tokens": 479984717.0, + "step": 13173 + }, + { + "epoch": 2.4464252553389043, + "grad_norm": 1.8215895891189575, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8783822059631348, + "num_tokens": 480021442.0, + "step": 13174 + }, + { + "epoch": 2.44661095636026, + "grad_norm": 1.5262197256088257, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8910638093948364, + "num_tokens": 480060246.0, + "step": 13175 + }, + { + "epoch": 2.446796657381616, + "grad_norm": 1.8414807319641113, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8737795352935791, + "num_tokens": 480092960.0, + "step": 13176 + }, + { + "epoch": 2.446982358402971, + "grad_norm": 1.5159980058670044, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8921194672584534, + "num_tokens": 480134769.0, + "step": 13177 + }, + { + "epoch": 2.447168059424327, + "grad_norm": 1.5425444841384888, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8940293788909912, + "num_tokens": 480174266.0, + "step": 13178 + }, + { + "epoch": 2.4473537604456825, + "grad_norm": 1.7281080484390259, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8959091305732727, + "num_tokens": 480206129.0, + "step": 13179 + }, + { + "epoch": 2.447539461467038, + "grad_norm": 1.7363581657409668, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8909732103347778, + "num_tokens": 480234533.0, + "step": 13180 + }, + { + "epoch": 2.4477251624883936, + "grad_norm": 1.5933339595794678, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8802345395088196, + "num_tokens": 480272441.0, + "step": 13181 + }, + { + "epoch": 2.4479108635097493, + "grad_norm": 1.6231006383895874, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8800440430641174, + "num_tokens": 480309392.0, + "step": 13182 + }, + { + "epoch": 2.448096564531105, + "grad_norm": 1.530084490776062, + "learning_rate": 1e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.9041450023651123, + "num_tokens": 480342527.0, + "step": 13183 + }, + { + "epoch": 2.4482822655524608, + "grad_norm": 1.6036514043807983, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8778772950172424, + "num_tokens": 480385772.0, + "step": 13184 + }, + { + "epoch": 2.448467966573816, + "grad_norm": 1.640160322189331, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.874049186706543, + "num_tokens": 480423266.0, + "step": 13185 + }, + { + "epoch": 2.448653667595172, + "grad_norm": 1.4765946865081787, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8850312829017639, + "num_tokens": 480466565.0, + "step": 13186 + }, + { + "epoch": 2.4488393686165275, + "grad_norm": 1.535047173500061, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8944625854492188, + "num_tokens": 480507460.0, + "step": 13187 + }, + { + "epoch": 2.449025069637883, + "grad_norm": 1.715745449066162, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8859584331512451, + "num_tokens": 480542914.0, + "step": 13188 + }, + { + "epoch": 2.4492107706592385, + "grad_norm": 1.607679009437561, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8840046525001526, + "num_tokens": 480577699.0, + "step": 13189 + }, + { + "epoch": 2.4493964716805943, + "grad_norm": 1.7927231788635254, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8819186687469482, + "num_tokens": 480609299.0, + "step": 13190 + }, + { + "epoch": 2.44958217270195, + "grad_norm": 1.642401933670044, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8901045322418213, + "num_tokens": 480642181.0, + "step": 13191 + }, + { + "epoch": 2.4497678737233053, + "grad_norm": 1.5350358486175537, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8884530067443848, + "num_tokens": 480682736.0, + "step": 13192 + }, + { + "epoch": 2.449953574744661, + "grad_norm": 1.5086880922317505, + "learning_rate": 1e-06, + "loss": 0.2702, + "mean_token_accuracy": 0.9007713198661804, + "num_tokens": 480722616.0, + "step": 13193 + }, + { + "epoch": 2.4501392757660168, + "grad_norm": 1.6371338367462158, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8942850828170776, + "num_tokens": 480757978.0, + "step": 13194 + }, + { + "epoch": 2.4503249767873725, + "grad_norm": 1.5156527757644653, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8944090008735657, + "num_tokens": 480796650.0, + "step": 13195 + }, + { + "epoch": 2.4505106778087278, + "grad_norm": 1.683435320854187, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8933941125869751, + "num_tokens": 480828395.0, + "step": 13196 + }, + { + "epoch": 2.4506963788300835, + "grad_norm": 1.5819084644317627, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8870585560798645, + "num_tokens": 480866303.0, + "step": 13197 + }, + { + "epoch": 2.4508820798514392, + "grad_norm": 1.575614333152771, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8939828276634216, + "num_tokens": 480901974.0, + "step": 13198 + }, + { + "epoch": 2.451067780872795, + "grad_norm": 1.6629759073257446, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8865461945533752, + "num_tokens": 480937758.0, + "step": 13199 + }, + { + "epoch": 2.4512534818941503, + "grad_norm": 1.4680800437927246, + "learning_rate": 1e-06, + "loss": 0.282, + "mean_token_accuracy": 0.8992211818695068, + "num_tokens": 480978543.0, + "step": 13200 + }, + { + "epoch": 2.451439182915506, + "grad_norm": 1.6948236227035522, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8676161766052246, + "num_tokens": 481016902.0, + "step": 13201 + }, + { + "epoch": 2.4516248839368617, + "grad_norm": 1.670433521270752, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8643200397491455, + "num_tokens": 481053017.0, + "step": 13202 + }, + { + "epoch": 2.451810584958217, + "grad_norm": 1.724130392074585, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8816593885421753, + "num_tokens": 481084770.0, + "step": 13203 + }, + { + "epoch": 2.4519962859795728, + "grad_norm": 1.5249422788619995, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8929629325866699, + "num_tokens": 481121010.0, + "step": 13204 + }, + { + "epoch": 2.4521819870009285, + "grad_norm": 1.6105912923812866, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8832609057426453, + "num_tokens": 481160033.0, + "step": 13205 + }, + { + "epoch": 2.452367688022284, + "grad_norm": 1.4770052433013916, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.893602728843689, + "num_tokens": 481200805.0, + "step": 13206 + }, + { + "epoch": 2.45255338904364, + "grad_norm": 1.6427603960037231, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8765002489089966, + "num_tokens": 481239217.0, + "step": 13207 + }, + { + "epoch": 2.4527390900649952, + "grad_norm": 1.693997859954834, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8890827894210815, + "num_tokens": 481277360.0, + "step": 13208 + }, + { + "epoch": 2.452924791086351, + "grad_norm": 1.5674126148223877, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8891876339912415, + "num_tokens": 481315162.0, + "step": 13209 + }, + { + "epoch": 2.4531104921077067, + "grad_norm": 1.6930615901947021, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8861504793167114, + "num_tokens": 481345428.0, + "step": 13210 + }, + { + "epoch": 2.453296193129062, + "grad_norm": 1.5152451992034912, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.893295168876648, + "num_tokens": 481386553.0, + "step": 13211 + }, + { + "epoch": 2.4534818941504177, + "grad_norm": 1.6158186197280884, + "learning_rate": 1e-06, + "loss": 0.2753, + "mean_token_accuracy": 0.8993351459503174, + "num_tokens": 481418271.0, + "step": 13212 + }, + { + "epoch": 2.4536675951717735, + "grad_norm": 1.713606834411621, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8926078677177429, + "num_tokens": 481450500.0, + "step": 13213 + }, + { + "epoch": 2.453853296193129, + "grad_norm": 1.5540136098861694, + "learning_rate": 1e-06, + "loss": 0.2664, + "mean_token_accuracy": 0.9017795324325562, + "num_tokens": 481488282.0, + "step": 13214 + }, + { + "epoch": 2.4540389972144845, + "grad_norm": 1.8221780061721802, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8815522193908691, + "num_tokens": 481520628.0, + "step": 13215 + }, + { + "epoch": 2.45422469823584, + "grad_norm": 1.5580490827560425, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.881174623966217, + "num_tokens": 481561880.0, + "step": 13216 + }, + { + "epoch": 2.454410399257196, + "grad_norm": 1.787293791770935, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8783730268478394, + "num_tokens": 481593733.0, + "step": 13217 + }, + { + "epoch": 2.4545961002785517, + "grad_norm": 1.5065386295318604, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8892303705215454, + "num_tokens": 481633150.0, + "step": 13218 + }, + { + "epoch": 2.454781801299907, + "grad_norm": 1.6049563884735107, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8791268467903137, + "num_tokens": 481671633.0, + "step": 13219 + }, + { + "epoch": 2.4549675023212627, + "grad_norm": 1.583526849746704, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8767611980438232, + "num_tokens": 481714159.0, + "step": 13220 + }, + { + "epoch": 2.4551532033426184, + "grad_norm": 1.5534679889678955, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8885627388954163, + "num_tokens": 481751460.0, + "step": 13221 + }, + { + "epoch": 2.455338904363974, + "grad_norm": 1.6808627843856812, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.894507884979248, + "num_tokens": 481788502.0, + "step": 13222 + }, + { + "epoch": 2.4555246053853295, + "grad_norm": 1.4767824411392212, + "learning_rate": 1e-06, + "loss": 0.255, + "mean_token_accuracy": 0.9065620303153992, + "num_tokens": 481826564.0, + "step": 13223 + }, + { + "epoch": 2.455710306406685, + "grad_norm": 1.5411643981933594, + "learning_rate": 1e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9025306105613708, + "num_tokens": 481859401.0, + "step": 13224 + }, + { + "epoch": 2.455896007428041, + "grad_norm": 1.6347341537475586, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8965265154838562, + "num_tokens": 481895702.0, + "step": 13225 + }, + { + "epoch": 2.4560817084493967, + "grad_norm": 1.6437530517578125, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8938197493553162, + "num_tokens": 481929480.0, + "step": 13226 + }, + { + "epoch": 2.456267409470752, + "grad_norm": 1.6298283338546753, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8849747180938721, + "num_tokens": 481967452.0, + "step": 13227 + }, + { + "epoch": 2.4564531104921077, + "grad_norm": 1.6938365697860718, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8932371735572815, + "num_tokens": 481999190.0, + "step": 13228 + }, + { + "epoch": 2.4566388115134634, + "grad_norm": 1.5615441799163818, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.888187825679779, + "num_tokens": 482034824.0, + "step": 13229 + }, + { + "epoch": 2.456824512534819, + "grad_norm": 1.6882688999176025, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8745996952056885, + "num_tokens": 482070624.0, + "step": 13230 + }, + { + "epoch": 2.4570102135561744, + "grad_norm": 1.6538361310958862, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8931673169136047, + "num_tokens": 482104989.0, + "step": 13231 + }, + { + "epoch": 2.45719591457753, + "grad_norm": 1.7682905197143555, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8803443908691406, + "num_tokens": 482140239.0, + "step": 13232 + }, + { + "epoch": 2.457381615598886, + "grad_norm": 1.640757441520691, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8886449337005615, + "num_tokens": 482172943.0, + "step": 13233 + }, + { + "epoch": 2.457567316620241, + "grad_norm": 1.5173803567886353, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8950258493423462, + "num_tokens": 482210016.0, + "step": 13234 + }, + { + "epoch": 2.457753017641597, + "grad_norm": 1.6522393226623535, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8967106938362122, + "num_tokens": 482244043.0, + "step": 13235 + }, + { + "epoch": 2.4579387186629527, + "grad_norm": 1.7137844562530518, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8677908182144165, + "num_tokens": 482285489.0, + "step": 13236 + }, + { + "epoch": 2.4581244196843084, + "grad_norm": 1.5815497636795044, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8783648014068604, + "num_tokens": 482327087.0, + "step": 13237 + }, + { + "epoch": 2.458310120705664, + "grad_norm": 1.5621339082717896, + "learning_rate": 1e-06, + "loss": 0.2716, + "mean_token_accuracy": 0.9030839204788208, + "num_tokens": 482361946.0, + "step": 13238 + }, + { + "epoch": 2.4584958217270194, + "grad_norm": 1.5739587545394897, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8844947814941406, + "num_tokens": 482403861.0, + "step": 13239 + }, + { + "epoch": 2.458681522748375, + "grad_norm": 1.578076958656311, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8806357383728027, + "num_tokens": 482443584.0, + "step": 13240 + }, + { + "epoch": 2.458867223769731, + "grad_norm": 1.4595708847045898, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8892319202423096, + "num_tokens": 482487621.0, + "step": 13241 + }, + { + "epoch": 2.459052924791086, + "grad_norm": 1.6641247272491455, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8865514993667603, + "num_tokens": 482521678.0, + "step": 13242 + }, + { + "epoch": 2.459238625812442, + "grad_norm": 1.6697261333465576, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.883739709854126, + "num_tokens": 482557405.0, + "step": 13243 + }, + { + "epoch": 2.4594243268337976, + "grad_norm": 1.6778746843338013, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8660225868225098, + "num_tokens": 482596002.0, + "step": 13244 + }, + { + "epoch": 2.4596100278551534, + "grad_norm": 1.6217328310012817, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8771657347679138, + "num_tokens": 482632510.0, + "step": 13245 + }, + { + "epoch": 2.4597957288765087, + "grad_norm": 1.7474228143692017, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8900060653686523, + "num_tokens": 482661832.0, + "step": 13246 + }, + { + "epoch": 2.4599814298978644, + "grad_norm": 1.7346218824386597, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8827593326568604, + "num_tokens": 482695872.0, + "step": 13247 + }, + { + "epoch": 2.46016713091922, + "grad_norm": 1.50957453250885, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8835382461547852, + "num_tokens": 482733343.0, + "step": 13248 + }, + { + "epoch": 2.460352831940576, + "grad_norm": 1.5241520404815674, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.8982576131820679, + "num_tokens": 482771702.0, + "step": 13249 + }, + { + "epoch": 2.460538532961931, + "grad_norm": 1.781815528869629, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8820559978485107, + "num_tokens": 482800870.0, + "step": 13250 + }, + { + "epoch": 2.460724233983287, + "grad_norm": 1.5529834032058716, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.885779082775116, + "num_tokens": 482835419.0, + "step": 13251 + }, + { + "epoch": 2.4609099350046426, + "grad_norm": 1.5178227424621582, + "learning_rate": 1e-06, + "loss": 0.269, + "mean_token_accuracy": 0.9009050130844116, + "num_tokens": 482877313.0, + "step": 13252 + }, + { + "epoch": 2.4610956360259983, + "grad_norm": 1.696729302406311, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8734708428382874, + "num_tokens": 482913421.0, + "step": 13253 + }, + { + "epoch": 2.4612813370473536, + "grad_norm": 1.5004034042358398, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8873578310012817, + "num_tokens": 482954439.0, + "step": 13254 + }, + { + "epoch": 2.4614670380687094, + "grad_norm": 1.6597083806991577, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8806806802749634, + "num_tokens": 482988157.0, + "step": 13255 + }, + { + "epoch": 2.461652739090065, + "grad_norm": 1.7064638137817383, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8866006135940552, + "num_tokens": 483021892.0, + "step": 13256 + }, + { + "epoch": 2.4618384401114204, + "grad_norm": 1.6865774393081665, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8927649259567261, + "num_tokens": 483056900.0, + "step": 13257 + }, + { + "epoch": 2.462024141132776, + "grad_norm": 1.5131549835205078, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8929381370544434, + "num_tokens": 483095553.0, + "step": 13258 + }, + { + "epoch": 2.462209842154132, + "grad_norm": 1.7690064907073975, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8726670742034912, + "num_tokens": 483128752.0, + "step": 13259 + }, + { + "epoch": 2.4623955431754876, + "grad_norm": 1.6189600229263306, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8781937956809998, + "num_tokens": 483168377.0, + "step": 13260 + }, + { + "epoch": 2.4625812441968433, + "grad_norm": 1.7347335815429688, + "learning_rate": 1e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9012913107872009, + "num_tokens": 483198337.0, + "step": 13261 + }, + { + "epoch": 2.4627669452181986, + "grad_norm": 1.803725004196167, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8747720122337341, + "num_tokens": 483233336.0, + "step": 13262 + }, + { + "epoch": 2.4629526462395543, + "grad_norm": 1.668176531791687, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8768796920776367, + "num_tokens": 483269146.0, + "step": 13263 + }, + { + "epoch": 2.46313834726091, + "grad_norm": 1.6539214849472046, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8946459889411926, + "num_tokens": 483303147.0, + "step": 13264 + }, + { + "epoch": 2.4633240482822654, + "grad_norm": 1.4684101343154907, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8881717920303345, + "num_tokens": 483344620.0, + "step": 13265 + }, + { + "epoch": 2.463509749303621, + "grad_norm": 1.613796353340149, + "learning_rate": 1e-06, + "loss": 0.265, + "mean_token_accuracy": 0.9047609567642212, + "num_tokens": 483381060.0, + "step": 13266 + }, + { + "epoch": 2.463695450324977, + "grad_norm": 1.6394480466842651, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8813295364379883, + "num_tokens": 483416777.0, + "step": 13267 + }, + { + "epoch": 2.4638811513463326, + "grad_norm": 1.6834135055541992, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8889461755752563, + "num_tokens": 483449325.0, + "step": 13268 + }, + { + "epoch": 2.464066852367688, + "grad_norm": 1.5483813285827637, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8814663290977478, + "num_tokens": 483489543.0, + "step": 13269 + }, + { + "epoch": 2.4642525533890436, + "grad_norm": 1.5452017784118652, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8932197093963623, + "num_tokens": 483527332.0, + "step": 13270 + }, + { + "epoch": 2.4644382544103993, + "grad_norm": 1.6576366424560547, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8845855593681335, + "num_tokens": 483563285.0, + "step": 13271 + }, + { + "epoch": 2.464623955431755, + "grad_norm": 1.6271337270736694, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8818372488021851, + "num_tokens": 483598399.0, + "step": 13272 + }, + { + "epoch": 2.4648096564531103, + "grad_norm": 1.6812889575958252, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8908613920211792, + "num_tokens": 483632269.0, + "step": 13273 + }, + { + "epoch": 2.464995357474466, + "grad_norm": 1.803508996963501, + "learning_rate": 1e-06, + "loss": 0.2815, + "mean_token_accuracy": 0.9019091129302979, + "num_tokens": 483660697.0, + "step": 13274 + }, + { + "epoch": 2.465181058495822, + "grad_norm": 1.4460761547088623, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8957057595252991, + "num_tokens": 483704451.0, + "step": 13275 + }, + { + "epoch": 2.4653667595171775, + "grad_norm": 1.628905177116394, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8840513825416565, + "num_tokens": 483745906.0, + "step": 13276 + }, + { + "epoch": 2.465552460538533, + "grad_norm": 1.5376137495040894, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8823089599609375, + "num_tokens": 483783131.0, + "step": 13277 + }, + { + "epoch": 2.4657381615598886, + "grad_norm": 1.5738922357559204, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8854692578315735, + "num_tokens": 483821753.0, + "step": 13278 + }, + { + "epoch": 2.4659238625812443, + "grad_norm": 1.8879413604736328, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8793008327484131, + "num_tokens": 483853594.0, + "step": 13279 + }, + { + "epoch": 2.4661095636025996, + "grad_norm": 1.6615852117538452, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8836477398872375, + "num_tokens": 483889417.0, + "step": 13280 + }, + { + "epoch": 2.4662952646239553, + "grad_norm": 1.6706808805465698, + "learning_rate": 1e-06, + "loss": 0.2613, + "mean_token_accuracy": 0.9026439189910889, + "num_tokens": 483921942.0, + "step": 13281 + }, + { + "epoch": 2.466480965645311, + "grad_norm": 1.6737021207809448, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.892852783203125, + "num_tokens": 483958762.0, + "step": 13282 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 1.6931240558624268, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8828662633895874, + "num_tokens": 483991569.0, + "step": 13283 + }, + { + "epoch": 2.4668523676880225, + "grad_norm": 1.5527024269104004, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.8980079889297485, + "num_tokens": 484026789.0, + "step": 13284 + }, + { + "epoch": 2.467038068709378, + "grad_norm": 1.48699152469635, + "learning_rate": 1e-06, + "loss": 0.2696, + "mean_token_accuracy": 0.9011707305908203, + "num_tokens": 484066707.0, + "step": 13285 + }, + { + "epoch": 2.4672237697307335, + "grad_norm": 1.631502389907837, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8738303780555725, + "num_tokens": 484106628.0, + "step": 13286 + }, + { + "epoch": 2.4674094707520893, + "grad_norm": 1.5474519729614258, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8887253999710083, + "num_tokens": 484149047.0, + "step": 13287 + }, + { + "epoch": 2.4675951717734446, + "grad_norm": 1.7664891481399536, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8890107870101929, + "num_tokens": 484180713.0, + "step": 13288 + }, + { + "epoch": 2.4677808727948003, + "grad_norm": 1.655898094177246, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8857454657554626, + "num_tokens": 484214598.0, + "step": 13289 + }, + { + "epoch": 2.467966573816156, + "grad_norm": 1.6336594820022583, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8757851719856262, + "num_tokens": 484250630.0, + "step": 13290 + }, + { + "epoch": 2.4681522748375118, + "grad_norm": 1.5207526683807373, + "learning_rate": 1e-06, + "loss": 0.2625, + "mean_token_accuracy": 0.9056143164634705, + "num_tokens": 484286577.0, + "step": 13291 + }, + { + "epoch": 2.468337975858867, + "grad_norm": 1.6879162788391113, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8800731897354126, + "num_tokens": 484323593.0, + "step": 13292 + }, + { + "epoch": 2.468523676880223, + "grad_norm": 1.6453666687011719, + "learning_rate": 1e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.9011704325675964, + "num_tokens": 484354549.0, + "step": 13293 + }, + { + "epoch": 2.4687093779015785, + "grad_norm": 1.6995255947113037, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8688619136810303, + "num_tokens": 484394514.0, + "step": 13294 + }, + { + "epoch": 2.4688950789229342, + "grad_norm": 1.6209745407104492, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8753108382225037, + "num_tokens": 484430288.0, + "step": 13295 + }, + { + "epoch": 2.4690807799442895, + "grad_norm": 1.6662472486495972, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8826616406440735, + "num_tokens": 484467186.0, + "step": 13296 + }, + { + "epoch": 2.4692664809656453, + "grad_norm": 1.6869639158248901, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8826230764389038, + "num_tokens": 484500059.0, + "step": 13297 + }, + { + "epoch": 2.469452181987001, + "grad_norm": 1.6672985553741455, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8857215642929077, + "num_tokens": 484535348.0, + "step": 13298 + }, + { + "epoch": 2.4696378830083567, + "grad_norm": 1.6763943433761597, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8779261112213135, + "num_tokens": 484568556.0, + "step": 13299 + }, + { + "epoch": 2.469823584029712, + "grad_norm": 1.5894367694854736, + "learning_rate": 1e-06, + "loss": 0.2867, + "mean_token_accuracy": 0.8942016363143921, + "num_tokens": 484604746.0, + "step": 13300 + }, + { + "epoch": 2.4700092850510678, + "grad_norm": 1.6665542125701904, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8841699361801147, + "num_tokens": 484640951.0, + "step": 13301 + }, + { + "epoch": 2.4701949860724235, + "grad_norm": 1.575351357460022, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8874073028564453, + "num_tokens": 484675636.0, + "step": 13302 + }, + { + "epoch": 2.470380687093779, + "grad_norm": 1.8701127767562866, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8839362263679504, + "num_tokens": 484705590.0, + "step": 13303 + }, + { + "epoch": 2.4705663881151345, + "grad_norm": 1.5744096040725708, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8812885284423828, + "num_tokens": 484745075.0, + "step": 13304 + }, + { + "epoch": 2.4707520891364902, + "grad_norm": 1.5825449228286743, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8837734460830688, + "num_tokens": 484784444.0, + "step": 13305 + }, + { + "epoch": 2.470937790157846, + "grad_norm": 1.615966796875, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8906203508377075, + "num_tokens": 484822363.0, + "step": 13306 + }, + { + "epoch": 2.4711234911792017, + "grad_norm": 1.5295774936676025, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8798423409461975, + "num_tokens": 484859829.0, + "step": 13307 + }, + { + "epoch": 2.471309192200557, + "grad_norm": 1.4924343824386597, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.884114682674408, + "num_tokens": 484901057.0, + "step": 13308 + }, + { + "epoch": 2.4714948932219127, + "grad_norm": 1.6645883321762085, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8933619260787964, + "num_tokens": 484932966.0, + "step": 13309 + }, + { + "epoch": 2.4716805942432685, + "grad_norm": 1.5360890626907349, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8831878900527954, + "num_tokens": 484974119.0, + "step": 13310 + }, + { + "epoch": 2.4718662952646238, + "grad_norm": 1.6066112518310547, + "learning_rate": 1e-06, + "loss": 0.2643, + "mean_token_accuracy": 0.9033665657043457, + "num_tokens": 485007368.0, + "step": 13311 + }, + { + "epoch": 2.4720519962859795, + "grad_norm": 1.6736645698547363, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8800290822982788, + "num_tokens": 485043419.0, + "step": 13312 + }, + { + "epoch": 2.4722376973073352, + "grad_norm": 1.4153865575790405, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.891936182975769, + "num_tokens": 485088264.0, + "step": 13313 + }, + { + "epoch": 2.472423398328691, + "grad_norm": 1.5862822532653809, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8913535475730896, + "num_tokens": 485124442.0, + "step": 13314 + }, + { + "epoch": 2.4726090993500462, + "grad_norm": 1.6909043788909912, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8812399506568909, + "num_tokens": 485157298.0, + "step": 13315 + }, + { + "epoch": 2.472794800371402, + "grad_norm": 1.5007596015930176, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8967419862747192, + "num_tokens": 485197714.0, + "step": 13316 + }, + { + "epoch": 2.4729805013927577, + "grad_norm": 1.631259799003601, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8764172792434692, + "num_tokens": 485235597.0, + "step": 13317 + }, + { + "epoch": 2.4731662024141134, + "grad_norm": 1.6124192476272583, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8786693811416626, + "num_tokens": 485274831.0, + "step": 13318 + }, + { + "epoch": 2.4733519034354687, + "grad_norm": 1.666958212852478, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8892068266868591, + "num_tokens": 485311588.0, + "step": 13319 + }, + { + "epoch": 2.4735376044568245, + "grad_norm": 1.545333981513977, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8792545795440674, + "num_tokens": 485351878.0, + "step": 13320 + }, + { + "epoch": 2.47372330547818, + "grad_norm": 1.5628536939620972, + "learning_rate": 1e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.8937115669250488, + "num_tokens": 485388410.0, + "step": 13321 + }, + { + "epoch": 2.473909006499536, + "grad_norm": 1.605597972869873, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8816150426864624, + "num_tokens": 485427873.0, + "step": 13322 + }, + { + "epoch": 2.4740947075208912, + "grad_norm": 1.575579285621643, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8922954797744751, + "num_tokens": 485463087.0, + "step": 13323 + }, + { + "epoch": 2.474280408542247, + "grad_norm": 1.5815095901489258, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8740359544754028, + "num_tokens": 485504770.0, + "step": 13324 + }, + { + "epoch": 2.4744661095636027, + "grad_norm": 1.552768349647522, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.882899820804596, + "num_tokens": 485544185.0, + "step": 13325 + }, + { + "epoch": 2.474651810584958, + "grad_norm": 1.5027010440826416, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8852391242980957, + "num_tokens": 485582809.0, + "step": 13326 + }, + { + "epoch": 2.4748375116063137, + "grad_norm": 1.5143795013427734, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8959842324256897, + "num_tokens": 485622856.0, + "step": 13327 + }, + { + "epoch": 2.4750232126276694, + "grad_norm": 1.588800311088562, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8935756087303162, + "num_tokens": 485663017.0, + "step": 13328 + }, + { + "epoch": 2.475208913649025, + "grad_norm": 1.7472625970840454, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8763009309768677, + "num_tokens": 485696821.0, + "step": 13329 + }, + { + "epoch": 2.475394614670381, + "grad_norm": 1.5910248756408691, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8919373750686646, + "num_tokens": 485734004.0, + "step": 13330 + }, + { + "epoch": 2.475580315691736, + "grad_norm": 1.549206018447876, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8882254362106323, + "num_tokens": 485772894.0, + "step": 13331 + }, + { + "epoch": 2.475766016713092, + "grad_norm": 1.652788519859314, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8761866092681885, + "num_tokens": 485808376.0, + "step": 13332 + }, + { + "epoch": 2.4759517177344477, + "grad_norm": 1.6833916902542114, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8716866374015808, + "num_tokens": 485847767.0, + "step": 13333 + }, + { + "epoch": 2.476137418755803, + "grad_norm": 1.5618243217468262, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8916840553283691, + "num_tokens": 485887192.0, + "step": 13334 + }, + { + "epoch": 2.4763231197771587, + "grad_norm": 1.464771032333374, + "learning_rate": 1e-06, + "loss": 0.2676, + "mean_token_accuracy": 0.9021139740943909, + "num_tokens": 485923714.0, + "step": 13335 + }, + { + "epoch": 2.4765088207985144, + "grad_norm": 1.6288070678710938, + "learning_rate": 1e-06, + "loss": 0.2689, + "mean_token_accuracy": 0.9017274379730225, + "num_tokens": 485956839.0, + "step": 13336 + }, + { + "epoch": 2.47669452181987, + "grad_norm": 1.635105013847351, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.884032130241394, + "num_tokens": 485992028.0, + "step": 13337 + }, + { + "epoch": 2.4768802228412254, + "grad_norm": 1.7246989011764526, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8876711130142212, + "num_tokens": 486026347.0, + "step": 13338 + }, + { + "epoch": 2.477065923862581, + "grad_norm": 1.5868685245513916, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8866292834281921, + "num_tokens": 486064807.0, + "step": 13339 + }, + { + "epoch": 2.477251624883937, + "grad_norm": 1.6268994808197021, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.877334475517273, + "num_tokens": 486103027.0, + "step": 13340 + }, + { + "epoch": 2.4774373259052926, + "grad_norm": 1.6991124153137207, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8944238424301147, + "num_tokens": 486136679.0, + "step": 13341 + }, + { + "epoch": 2.477623026926648, + "grad_norm": 1.5355278253555298, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8811372518539429, + "num_tokens": 486178579.0, + "step": 13342 + }, + { + "epoch": 2.4778087279480037, + "grad_norm": 1.5070807933807373, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8871157169342041, + "num_tokens": 486216310.0, + "step": 13343 + }, + { + "epoch": 2.4779944289693594, + "grad_norm": 1.694900631904602, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8728044033050537, + "num_tokens": 486251741.0, + "step": 13344 + }, + { + "epoch": 2.478180129990715, + "grad_norm": 1.503780484199524, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8868767023086548, + "num_tokens": 486295434.0, + "step": 13345 + }, + { + "epoch": 2.4783658310120704, + "grad_norm": 1.5422980785369873, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8834560513496399, + "num_tokens": 486333180.0, + "step": 13346 + }, + { + "epoch": 2.478551532033426, + "grad_norm": 1.6140142679214478, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8894675374031067, + "num_tokens": 486372537.0, + "step": 13347 + }, + { + "epoch": 2.478737233054782, + "grad_norm": 1.7373762130737305, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.887081503868103, + "num_tokens": 486405781.0, + "step": 13348 + }, + { + "epoch": 2.478922934076137, + "grad_norm": 1.8908040523529053, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8897354006767273, + "num_tokens": 486434692.0, + "step": 13349 + }, + { + "epoch": 2.479108635097493, + "grad_norm": 1.4456286430358887, + "learning_rate": 1e-06, + "loss": 0.2661, + "mean_token_accuracy": 0.9019355177879333, + "num_tokens": 486475113.0, + "step": 13350 + }, + { + "epoch": 2.4792943361188486, + "grad_norm": 1.7071623802185059, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8840956091880798, + "num_tokens": 486509301.0, + "step": 13351 + }, + { + "epoch": 2.4794800371402044, + "grad_norm": 1.6684272289276123, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8901166915893555, + "num_tokens": 486541739.0, + "step": 13352 + }, + { + "epoch": 2.47966573816156, + "grad_norm": 1.6460659503936768, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8967746496200562, + "num_tokens": 486576007.0, + "step": 13353 + }, + { + "epoch": 2.4798514391829154, + "grad_norm": 1.5215126276016235, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.881928026676178, + "num_tokens": 486617298.0, + "step": 13354 + }, + { + "epoch": 2.480037140204271, + "grad_norm": 1.4649441242218018, + "learning_rate": 1e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.9036899209022522, + "num_tokens": 486656765.0, + "step": 13355 + }, + { + "epoch": 2.480222841225627, + "grad_norm": 1.7279492616653442, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8844563961029053, + "num_tokens": 486692107.0, + "step": 13356 + }, + { + "epoch": 2.480408542246982, + "grad_norm": 1.7363210916519165, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8947478532791138, + "num_tokens": 486721825.0, + "step": 13357 + }, + { + "epoch": 2.480594243268338, + "grad_norm": 1.7345290184020996, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8770847916603088, + "num_tokens": 486758656.0, + "step": 13358 + }, + { + "epoch": 2.4807799442896936, + "grad_norm": 1.5190603733062744, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8879290819168091, + "num_tokens": 486800184.0, + "step": 13359 + }, + { + "epoch": 2.4809656453110494, + "grad_norm": 1.7794303894042969, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.885672926902771, + "num_tokens": 486835033.0, + "step": 13360 + }, + { + "epoch": 2.4811513463324046, + "grad_norm": 1.620611548423767, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.894579291343689, + "num_tokens": 486869042.0, + "step": 13361 + }, + { + "epoch": 2.4813370473537604, + "grad_norm": 1.6810033321380615, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8821136951446533, + "num_tokens": 486905111.0, + "step": 13362 + }, + { + "epoch": 2.481522748375116, + "grad_norm": 1.7694778442382812, + "learning_rate": 1e-06, + "loss": 0.2735, + "mean_token_accuracy": 0.9029108285903931, + "num_tokens": 486935146.0, + "step": 13363 + }, + { + "epoch": 2.481708449396472, + "grad_norm": 1.8116015195846558, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8769830465316772, + "num_tokens": 486965909.0, + "step": 13364 + }, + { + "epoch": 2.481894150417827, + "grad_norm": 1.6470363140106201, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8951661586761475, + "num_tokens": 486998854.0, + "step": 13365 + }, + { + "epoch": 2.482079851439183, + "grad_norm": 1.526163935661316, + "learning_rate": 1e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.9013066291809082, + "num_tokens": 487036391.0, + "step": 13366 + }, + { + "epoch": 2.4822655524605386, + "grad_norm": 1.7370920181274414, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8712349534034729, + "num_tokens": 487072812.0, + "step": 13367 + }, + { + "epoch": 2.4824512534818943, + "grad_norm": 1.5058488845825195, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8823949098587036, + "num_tokens": 487112136.0, + "step": 13368 + }, + { + "epoch": 2.4826369545032496, + "grad_norm": 1.7366114854812622, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8794639110565186, + "num_tokens": 487147347.0, + "step": 13369 + }, + { + "epoch": 2.4828226555246053, + "grad_norm": 1.6382770538330078, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8825786113739014, + "num_tokens": 487184869.0, + "step": 13370 + }, + { + "epoch": 2.483008356545961, + "grad_norm": 1.639990210533142, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8937110900878906, + "num_tokens": 487220143.0, + "step": 13371 + }, + { + "epoch": 2.4831940575673164, + "grad_norm": 1.6499147415161133, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8905274868011475, + "num_tokens": 487255607.0, + "step": 13372 + }, + { + "epoch": 2.483379758588672, + "grad_norm": 1.7877625226974487, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8919061422348022, + "num_tokens": 487286539.0, + "step": 13373 + }, + { + "epoch": 2.483565459610028, + "grad_norm": 1.723186731338501, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8895068168640137, + "num_tokens": 487319002.0, + "step": 13374 + }, + { + "epoch": 2.4837511606313836, + "grad_norm": 1.5817517042160034, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8939642310142517, + "num_tokens": 487357016.0, + "step": 13375 + }, + { + "epoch": 2.4839368616527393, + "grad_norm": 1.588645577430725, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8769117593765259, + "num_tokens": 487396942.0, + "step": 13376 + }, + { + "epoch": 2.4841225626740946, + "grad_norm": 1.5785218477249146, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8864217400550842, + "num_tokens": 487435361.0, + "step": 13377 + }, + { + "epoch": 2.4843082636954503, + "grad_norm": 1.6752116680145264, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.891544759273529, + "num_tokens": 487468758.0, + "step": 13378 + }, + { + "epoch": 2.484493964716806, + "grad_norm": 1.5058733224868774, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8898221850395203, + "num_tokens": 487505884.0, + "step": 13379 + }, + { + "epoch": 2.4846796657381613, + "grad_norm": 1.6291452646255493, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8770999908447266, + "num_tokens": 487543873.0, + "step": 13380 + }, + { + "epoch": 2.484865366759517, + "grad_norm": 1.7082364559173584, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8873735070228577, + "num_tokens": 487576992.0, + "step": 13381 + }, + { + "epoch": 2.485051067780873, + "grad_norm": 1.436793565750122, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8894122838973999, + "num_tokens": 487619492.0, + "step": 13382 + }, + { + "epoch": 2.4852367688022285, + "grad_norm": 1.5274983644485474, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8959590196609497, + "num_tokens": 487660851.0, + "step": 13383 + }, + { + "epoch": 2.4854224698235843, + "grad_norm": 1.687343955039978, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8785704970359802, + "num_tokens": 487696464.0, + "step": 13384 + }, + { + "epoch": 2.4856081708449396, + "grad_norm": 1.711865782737732, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.879479169845581, + "num_tokens": 487731392.0, + "step": 13385 + }, + { + "epoch": 2.4857938718662953, + "grad_norm": 1.577889323234558, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8926565647125244, + "num_tokens": 487767239.0, + "step": 13386 + }, + { + "epoch": 2.485979572887651, + "grad_norm": 1.488304853439331, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8963818550109863, + "num_tokens": 487807754.0, + "step": 13387 + }, + { + "epoch": 2.4861652739090063, + "grad_norm": 1.5553665161132812, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8879423141479492, + "num_tokens": 487845268.0, + "step": 13388 + }, + { + "epoch": 2.486350974930362, + "grad_norm": 1.7774150371551514, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8893049359321594, + "num_tokens": 487880848.0, + "step": 13389 + }, + { + "epoch": 2.486536675951718, + "grad_norm": 1.4643731117248535, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8881595730781555, + "num_tokens": 487921556.0, + "step": 13390 + }, + { + "epoch": 2.4867223769730735, + "grad_norm": 1.5528695583343506, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8897178769111633, + "num_tokens": 487958637.0, + "step": 13391 + }, + { + "epoch": 2.486908077994429, + "grad_norm": 1.5961947441101074, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8955886363983154, + "num_tokens": 487995710.0, + "step": 13392 + }, + { + "epoch": 2.4870937790157845, + "grad_norm": 1.4743142127990723, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8870052695274353, + "num_tokens": 488035905.0, + "step": 13393 + }, + { + "epoch": 2.4872794800371403, + "grad_norm": 1.4972622394561768, + "learning_rate": 1e-06, + "loss": 0.2486, + "mean_token_accuracy": 0.9072011113166809, + "num_tokens": 488071462.0, + "step": 13394 + }, + { + "epoch": 2.487465181058496, + "grad_norm": 1.6348929405212402, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8782733082771301, + "num_tokens": 488109268.0, + "step": 13395 + }, + { + "epoch": 2.4876508820798513, + "grad_norm": 1.6930514574050903, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8918206095695496, + "num_tokens": 488140796.0, + "step": 13396 + }, + { + "epoch": 2.487836583101207, + "grad_norm": 1.6134834289550781, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8891846537590027, + "num_tokens": 488180769.0, + "step": 13397 + }, + { + "epoch": 2.4880222841225628, + "grad_norm": 1.5984123945236206, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8819421529769897, + "num_tokens": 488219692.0, + "step": 13398 + }, + { + "epoch": 2.4882079851439185, + "grad_norm": 1.5803176164627075, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8760417699813843, + "num_tokens": 488259083.0, + "step": 13399 + }, + { + "epoch": 2.488393686165274, + "grad_norm": 1.603583812713623, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8805850744247437, + "num_tokens": 488296070.0, + "step": 13400 + }, + { + "epoch": 2.4885793871866295, + "grad_norm": 1.635350227355957, + "learning_rate": 1e-06, + "loss": 0.2792, + "mean_token_accuracy": 0.8986972570419312, + "num_tokens": 488329776.0, + "step": 13401 + }, + { + "epoch": 2.4887650882079853, + "grad_norm": 1.7923411130905151, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8854852914810181, + "num_tokens": 488361450.0, + "step": 13402 + }, + { + "epoch": 2.4889507892293405, + "grad_norm": 1.5647644996643066, + "learning_rate": 1e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9049044251441956, + "num_tokens": 488397659.0, + "step": 13403 + }, + { + "epoch": 2.4891364902506963, + "grad_norm": 1.6348097324371338, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8768434524536133, + "num_tokens": 488435534.0, + "step": 13404 + }, + { + "epoch": 2.489322191272052, + "grad_norm": 1.5590320825576782, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8921347260475159, + "num_tokens": 488471892.0, + "step": 13405 + }, + { + "epoch": 2.4895078922934077, + "grad_norm": 1.4399044513702393, + "learning_rate": 1e-06, + "loss": 0.2522, + "mean_token_accuracy": 0.9092051982879639, + "num_tokens": 488510965.0, + "step": 13406 + }, + { + "epoch": 2.4896935933147635, + "grad_norm": 1.4493366479873657, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8737972974777222, + "num_tokens": 488557570.0, + "step": 13407 + }, + { + "epoch": 2.4898792943361188, + "grad_norm": 1.607468843460083, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8649274110794067, + "num_tokens": 488600952.0, + "step": 13408 + }, + { + "epoch": 2.4900649953574745, + "grad_norm": 1.5261476039886475, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.892754077911377, + "num_tokens": 488637103.0, + "step": 13409 + }, + { + "epoch": 2.4902506963788302, + "grad_norm": 1.6192842721939087, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8823673129081726, + "num_tokens": 488674627.0, + "step": 13410 + }, + { + "epoch": 2.4904363974001855, + "grad_norm": 1.5590310096740723, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8905954360961914, + "num_tokens": 488714456.0, + "step": 13411 + }, + { + "epoch": 2.4906220984215413, + "grad_norm": 1.780454397201538, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8885874152183533, + "num_tokens": 488748287.0, + "step": 13412 + }, + { + "epoch": 2.490807799442897, + "grad_norm": 1.6845457553863525, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8860275745391846, + "num_tokens": 488784370.0, + "step": 13413 + }, + { + "epoch": 2.4909935004642527, + "grad_norm": 1.539971947669983, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8843990564346313, + "num_tokens": 488825692.0, + "step": 13414 + }, + { + "epoch": 2.491179201485608, + "grad_norm": 1.6350992918014526, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8828733563423157, + "num_tokens": 488861586.0, + "step": 13415 + }, + { + "epoch": 2.4913649025069637, + "grad_norm": 1.7098281383514404, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8799685835838318, + "num_tokens": 488899400.0, + "step": 13416 + }, + { + "epoch": 2.4915506035283195, + "grad_norm": 1.6446149349212646, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8786212205886841, + "num_tokens": 488937538.0, + "step": 13417 + }, + { + "epoch": 2.491736304549675, + "grad_norm": 1.613060712814331, + "learning_rate": 1e-06, + "loss": 0.279, + "mean_token_accuracy": 0.8976938724517822, + "num_tokens": 488972092.0, + "step": 13418 + }, + { + "epoch": 2.4919220055710305, + "grad_norm": 1.8233586549758911, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8810975551605225, + "num_tokens": 489007076.0, + "step": 13419 + }, + { + "epoch": 2.4921077065923862, + "grad_norm": 1.5024470090866089, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8827637434005737, + "num_tokens": 489048774.0, + "step": 13420 + }, + { + "epoch": 2.492293407613742, + "grad_norm": 1.7351396083831787, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8913123607635498, + "num_tokens": 489080992.0, + "step": 13421 + }, + { + "epoch": 2.4924791086350977, + "grad_norm": 1.6683555841445923, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8729368448257446, + "num_tokens": 489120349.0, + "step": 13422 + }, + { + "epoch": 2.492664809656453, + "grad_norm": 1.7297254800796509, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8653818368911743, + "num_tokens": 489157329.0, + "step": 13423 + }, + { + "epoch": 2.4928505106778087, + "grad_norm": 1.485032081604004, + "learning_rate": 1e-06, + "loss": 0.2598, + "mean_token_accuracy": 0.9036542177200317, + "num_tokens": 489193742.0, + "step": 13424 + }, + { + "epoch": 2.4930362116991645, + "grad_norm": 1.551578164100647, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8876672983169556, + "num_tokens": 489231013.0, + "step": 13425 + }, + { + "epoch": 2.4932219127205197, + "grad_norm": 1.619431972503662, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8908823132514954, + "num_tokens": 489264660.0, + "step": 13426 + }, + { + "epoch": 2.4934076137418755, + "grad_norm": 1.4736679792404175, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8923830986022949, + "num_tokens": 489304462.0, + "step": 13427 + }, + { + "epoch": 2.493593314763231, + "grad_norm": 1.7313932180404663, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8888040781021118, + "num_tokens": 489338713.0, + "step": 13428 + }, + { + "epoch": 2.493779015784587, + "grad_norm": 1.599937915802002, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8854560852050781, + "num_tokens": 489374144.0, + "step": 13429 + }, + { + "epoch": 2.4939647168059427, + "grad_norm": 1.7150835990905762, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.863548755645752, + "num_tokens": 489411144.0, + "step": 13430 + }, + { + "epoch": 2.494150417827298, + "grad_norm": 1.5233681201934814, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8958170413970947, + "num_tokens": 489448805.0, + "step": 13431 + }, + { + "epoch": 2.4943361188486537, + "grad_norm": 1.7932244539260864, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8863972425460815, + "num_tokens": 489478495.0, + "step": 13432 + }, + { + "epoch": 2.4945218198700094, + "grad_norm": 1.6256998777389526, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.891769528388977, + "num_tokens": 489512218.0, + "step": 13433 + }, + { + "epoch": 2.4947075208913647, + "grad_norm": 1.810928463935852, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8777221441268921, + "num_tokens": 489546120.0, + "step": 13434 + }, + { + "epoch": 2.4948932219127204, + "grad_norm": 1.6990227699279785, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8823509216308594, + "num_tokens": 489582124.0, + "step": 13435 + }, + { + "epoch": 2.495078922934076, + "grad_norm": 1.5220088958740234, + "learning_rate": 1e-06, + "loss": 0.2685, + "mean_token_accuracy": 0.9025737047195435, + "num_tokens": 489618260.0, + "step": 13436 + }, + { + "epoch": 2.495264623955432, + "grad_norm": 1.4877358675003052, + "learning_rate": 1e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.8987584114074707, + "num_tokens": 489656684.0, + "step": 13437 + }, + { + "epoch": 2.495450324976787, + "grad_norm": 1.696514368057251, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8795111179351807, + "num_tokens": 489689297.0, + "step": 13438 + }, + { + "epoch": 2.495636025998143, + "grad_norm": 1.71696937084198, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8872345685958862, + "num_tokens": 489720405.0, + "step": 13439 + }, + { + "epoch": 2.4958217270194987, + "grad_norm": 1.5929224491119385, + "learning_rate": 1e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.9009146690368652, + "num_tokens": 489757137.0, + "step": 13440 + }, + { + "epoch": 2.4960074280408544, + "grad_norm": 1.5732803344726562, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8729205131530762, + "num_tokens": 489798810.0, + "step": 13441 + }, + { + "epoch": 2.4961931290622097, + "grad_norm": 1.5491186380386353, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8853106498718262, + "num_tokens": 489840174.0, + "step": 13442 + }, + { + "epoch": 2.4963788300835654, + "grad_norm": 1.7695220708847046, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8871528506278992, + "num_tokens": 489871681.0, + "step": 13443 + }, + { + "epoch": 2.496564531104921, + "grad_norm": 1.5311520099639893, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8956364989280701, + "num_tokens": 489912035.0, + "step": 13444 + }, + { + "epoch": 2.496750232126277, + "grad_norm": 1.5635321140289307, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8864468336105347, + "num_tokens": 489952836.0, + "step": 13445 + }, + { + "epoch": 2.496935933147632, + "grad_norm": 1.5229055881500244, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8867001533508301, + "num_tokens": 489994386.0, + "step": 13446 + }, + { + "epoch": 2.497121634168988, + "grad_norm": 1.7203186750411987, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8811544179916382, + "num_tokens": 490025389.0, + "step": 13447 + }, + { + "epoch": 2.4973073351903436, + "grad_norm": 1.6050057411193848, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8768552541732788, + "num_tokens": 490063226.0, + "step": 13448 + }, + { + "epoch": 2.497493036211699, + "grad_norm": 1.5123289823532104, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8813310861587524, + "num_tokens": 490100826.0, + "step": 13449 + }, + { + "epoch": 2.4976787372330547, + "grad_norm": 1.6798325777053833, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8921303749084473, + "num_tokens": 490135430.0, + "step": 13450 + }, + { + "epoch": 2.4978644382544104, + "grad_norm": 1.4874560832977295, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8891693949699402, + "num_tokens": 490172735.0, + "step": 13451 + }, + { + "epoch": 2.498050139275766, + "grad_norm": 1.480932593345642, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.8944201469421387, + "num_tokens": 490210717.0, + "step": 13452 + }, + { + "epoch": 2.498235840297122, + "grad_norm": 1.4613090753555298, + "learning_rate": 1e-06, + "loss": 0.256, + "mean_token_accuracy": 0.9062432050704956, + "num_tokens": 490247248.0, + "step": 13453 + }, + { + "epoch": 2.498421541318477, + "grad_norm": 1.5772340297698975, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8890931010246277, + "num_tokens": 490283918.0, + "step": 13454 + }, + { + "epoch": 2.498607242339833, + "grad_norm": 1.4632216691970825, + "learning_rate": 1e-06, + "loss": 0.2798, + "mean_token_accuracy": 0.8979240655899048, + "num_tokens": 490326291.0, + "step": 13455 + }, + { + "epoch": 2.4987929433611886, + "grad_norm": 1.5921406745910645, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8800278306007385, + "num_tokens": 490365213.0, + "step": 13456 + }, + { + "epoch": 2.498978644382544, + "grad_norm": 1.5911390781402588, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8756586313247681, + "num_tokens": 490405723.0, + "step": 13457 + }, + { + "epoch": 2.4991643454038996, + "grad_norm": 1.5969306230545044, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8857037425041199, + "num_tokens": 490444166.0, + "step": 13458 + }, + { + "epoch": 2.4993500464252554, + "grad_norm": 1.636957049369812, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.881781280040741, + "num_tokens": 490481746.0, + "step": 13459 + }, + { + "epoch": 2.499535747446611, + "grad_norm": 1.819967269897461, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8678805232048035, + "num_tokens": 490517626.0, + "step": 13460 + }, + { + "epoch": 2.4997214484679664, + "grad_norm": 1.711073637008667, + "learning_rate": 1e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.8944745063781738, + "num_tokens": 490550249.0, + "step": 13461 + }, + { + "epoch": 2.499907149489322, + "grad_norm": 1.471372127532959, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8900404572486877, + "num_tokens": 490595254.0, + "step": 13462 + }, + { + "epoch": 2.500092850510678, + "grad_norm": 1.606200933456421, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8789995908737183, + "num_tokens": 490633932.0, + "step": 13463 + }, + { + "epoch": 2.500278551532033, + "grad_norm": 1.5244780778884888, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8945670127868652, + "num_tokens": 490671131.0, + "step": 13464 + }, + { + "epoch": 2.500464252553389, + "grad_norm": 1.8037320375442505, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8829673528671265, + "num_tokens": 490702081.0, + "step": 13465 + }, + { + "epoch": 2.5006499535747446, + "grad_norm": 1.709906816482544, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.896683394908905, + "num_tokens": 490732916.0, + "step": 13466 + }, + { + "epoch": 2.5008356545961004, + "grad_norm": 1.7574681043624878, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8732426166534424, + "num_tokens": 490766344.0, + "step": 13467 + }, + { + "epoch": 2.501021355617456, + "grad_norm": 1.5624843835830688, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8689641952514648, + "num_tokens": 490806081.0, + "step": 13468 + }, + { + "epoch": 2.5012070566388114, + "grad_norm": 1.647364854812622, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8873370289802551, + "num_tokens": 490840127.0, + "step": 13469 + }, + { + "epoch": 2.501392757660167, + "grad_norm": 1.6387617588043213, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.9004788994789124, + "num_tokens": 490873982.0, + "step": 13470 + }, + { + "epoch": 2.501578458681523, + "grad_norm": 1.55437171459198, + "learning_rate": 1e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.8969913721084595, + "num_tokens": 490911206.0, + "step": 13471 + }, + { + "epoch": 2.501764159702878, + "grad_norm": 1.6556737422943115, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8805406093597412, + "num_tokens": 490944347.0, + "step": 13472 + }, + { + "epoch": 2.501949860724234, + "grad_norm": 1.6285085678100586, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8764806985855103, + "num_tokens": 490981370.0, + "step": 13473 + }, + { + "epoch": 2.5021355617455896, + "grad_norm": 1.5524425506591797, + "learning_rate": 1e-06, + "loss": 0.2713, + "mean_token_accuracy": 0.9013239145278931, + "num_tokens": 491013849.0, + "step": 13474 + }, + { + "epoch": 2.5023212627669453, + "grad_norm": 1.6295745372772217, + "learning_rate": 1e-06, + "loss": 0.2743, + "mean_token_accuracy": 0.9000478982925415, + "num_tokens": 491045756.0, + "step": 13475 + }, + { + "epoch": 2.502506963788301, + "grad_norm": 1.6005750894546509, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8912951946258545, + "num_tokens": 491082643.0, + "step": 13476 + }, + { + "epoch": 2.5026926648096564, + "grad_norm": 1.691236138343811, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8913577795028687, + "num_tokens": 491114819.0, + "step": 13477 + }, + { + "epoch": 2.502878365831012, + "grad_norm": 1.643187403678894, + "learning_rate": 1e-06, + "loss": 0.2785, + "mean_token_accuracy": 0.8972249627113342, + "num_tokens": 491147072.0, + "step": 13478 + }, + { + "epoch": 2.503064066852368, + "grad_norm": 1.6286240816116333, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8818506002426147, + "num_tokens": 491182093.0, + "step": 13479 + }, + { + "epoch": 2.503249767873723, + "grad_norm": 1.6173175573349, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8854596614837646, + "num_tokens": 491217534.0, + "step": 13480 + }, + { + "epoch": 2.503435468895079, + "grad_norm": 1.6570695638656616, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8788177371025085, + "num_tokens": 491251656.0, + "step": 13481 + }, + { + "epoch": 2.5036211699164346, + "grad_norm": 1.6803481578826904, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8801859617233276, + "num_tokens": 491287198.0, + "step": 13482 + }, + { + "epoch": 2.5038068709377903, + "grad_norm": 1.490260124206543, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8937287926673889, + "num_tokens": 491328153.0, + "step": 13483 + }, + { + "epoch": 2.503992571959146, + "grad_norm": 1.7926256656646729, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8867784738540649, + "num_tokens": 491361501.0, + "step": 13484 + }, + { + "epoch": 2.5041782729805013, + "grad_norm": 1.7068103551864624, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.8965879678726196, + "num_tokens": 491391273.0, + "step": 13485 + }, + { + "epoch": 2.504363974001857, + "grad_norm": 1.7242594957351685, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8866267800331116, + "num_tokens": 491421415.0, + "step": 13486 + }, + { + "epoch": 2.5045496750232124, + "grad_norm": 1.679642677307129, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8780304193496704, + "num_tokens": 491456699.0, + "step": 13487 + }, + { + "epoch": 2.504735376044568, + "grad_norm": 1.645394206047058, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8744670152664185, + "num_tokens": 491491450.0, + "step": 13488 + }, + { + "epoch": 2.504921077065924, + "grad_norm": 1.7223719358444214, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.878301739692688, + "num_tokens": 491525548.0, + "step": 13489 + }, + { + "epoch": 2.5051067780872796, + "grad_norm": 1.4974511861801147, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.8926646113395691, + "num_tokens": 491562622.0, + "step": 13490 + }, + { + "epoch": 2.5052924791086353, + "grad_norm": 1.727088212966919, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8710111975669861, + "num_tokens": 491599196.0, + "step": 13491 + }, + { + "epoch": 2.5054781801299906, + "grad_norm": 1.5238982439041138, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8787558078765869, + "num_tokens": 491638252.0, + "step": 13492 + }, + { + "epoch": 2.5056638811513463, + "grad_norm": 1.5994272232055664, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.891082763671875, + "num_tokens": 491672726.0, + "step": 13493 + }, + { + "epoch": 2.505849582172702, + "grad_norm": 1.8548439741134644, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8689751625061035, + "num_tokens": 491706329.0, + "step": 13494 + }, + { + "epoch": 2.5060352831940573, + "grad_norm": 1.6433742046356201, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.887355387210846, + "num_tokens": 491743003.0, + "step": 13495 + }, + { + "epoch": 2.506220984215413, + "grad_norm": 1.6099435091018677, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8792833685874939, + "num_tokens": 491781194.0, + "step": 13496 + }, + { + "epoch": 2.506406685236769, + "grad_norm": 1.6332898139953613, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8724071979522705, + "num_tokens": 491819836.0, + "step": 13497 + }, + { + "epoch": 2.5065923862581245, + "grad_norm": 1.487358570098877, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8875977993011475, + "num_tokens": 491862406.0, + "step": 13498 + }, + { + "epoch": 2.5067780872794803, + "grad_norm": 1.561641812324524, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8951222896575928, + "num_tokens": 491895592.0, + "step": 13499 + }, + { + "epoch": 2.5069637883008355, + "grad_norm": 1.5444316864013672, + "learning_rate": 1e-06, + "loss": 0.2747, + "mean_token_accuracy": 0.9008760452270508, + "num_tokens": 491930097.0, + "step": 13500 + }, + { + "epoch": 2.5071494893221913, + "grad_norm": 1.7600045204162598, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8922097682952881, + "num_tokens": 491964060.0, + "step": 13501 + }, + { + "epoch": 2.507335190343547, + "grad_norm": 1.7565290927886963, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8819569945335388, + "num_tokens": 491999151.0, + "step": 13502 + }, + { + "epoch": 2.5075208913649023, + "grad_norm": 1.5510486364364624, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8803430795669556, + "num_tokens": 492039938.0, + "step": 13503 + }, + { + "epoch": 2.507706592386258, + "grad_norm": 1.4483978748321533, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8924107551574707, + "num_tokens": 492081345.0, + "step": 13504 + }, + { + "epoch": 2.5078922934076138, + "grad_norm": 1.500440239906311, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.8978567123413086, + "num_tokens": 492120167.0, + "step": 13505 + }, + { + "epoch": 2.5080779944289695, + "grad_norm": 1.6383459568023682, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8826788663864136, + "num_tokens": 492157459.0, + "step": 13506 + }, + { + "epoch": 2.5082636954503252, + "grad_norm": 1.851833701133728, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8811469078063965, + "num_tokens": 492187424.0, + "step": 13507 + }, + { + "epoch": 2.5084493964716805, + "grad_norm": 1.6413859128952026, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8851169347763062, + "num_tokens": 492228244.0, + "step": 13508 + }, + { + "epoch": 2.5086350974930363, + "grad_norm": 1.6694360971450806, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.876456618309021, + "num_tokens": 492264613.0, + "step": 13509 + }, + { + "epoch": 2.508820798514392, + "grad_norm": 1.4905990362167358, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8855291604995728, + "num_tokens": 492306995.0, + "step": 13510 + }, + { + "epoch": 2.5090064995357473, + "grad_norm": 1.6472944021224976, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.893332302570343, + "num_tokens": 492342267.0, + "step": 13511 + }, + { + "epoch": 2.509192200557103, + "grad_norm": 1.5738756656646729, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.881916880607605, + "num_tokens": 492380196.0, + "step": 13512 + }, + { + "epoch": 2.5093779015784587, + "grad_norm": 1.6288843154907227, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8675922155380249, + "num_tokens": 492419702.0, + "step": 13513 + }, + { + "epoch": 2.5095636025998145, + "grad_norm": 1.6138261556625366, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8898195624351501, + "num_tokens": 492456157.0, + "step": 13514 + }, + { + "epoch": 2.5097493036211698, + "grad_norm": 1.5378705263137817, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8773057460784912, + "num_tokens": 492498297.0, + "step": 13515 + }, + { + "epoch": 2.5099350046425255, + "grad_norm": 1.4687690734863281, + "learning_rate": 1e-06, + "loss": 0.2677, + "mean_token_accuracy": 0.9023464918136597, + "num_tokens": 492536000.0, + "step": 13516 + }, + { + "epoch": 2.5101207056638812, + "grad_norm": 1.711552381515503, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8907435536384583, + "num_tokens": 492574983.0, + "step": 13517 + }, + { + "epoch": 2.5103064066852365, + "grad_norm": 1.6340711116790771, + "learning_rate": 1e-06, + "loss": 0.268, + "mean_token_accuracy": 0.9027988910675049, + "num_tokens": 492607212.0, + "step": 13518 + }, + { + "epoch": 2.5104921077065923, + "grad_norm": 1.6372877359390259, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.889366626739502, + "num_tokens": 492644067.0, + "step": 13519 + }, + { + "epoch": 2.510677808727948, + "grad_norm": 1.612053632736206, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8930697441101074, + "num_tokens": 492680792.0, + "step": 13520 + }, + { + "epoch": 2.5108635097493037, + "grad_norm": 1.59275221824646, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.886716365814209, + "num_tokens": 492719329.0, + "step": 13521 + }, + { + "epoch": 2.5110492107706595, + "grad_norm": 1.640197992324829, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8879798650741577, + "num_tokens": 492754582.0, + "step": 13522 + }, + { + "epoch": 2.5112349117920147, + "grad_norm": 1.591487169265747, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8914257287979126, + "num_tokens": 492789087.0, + "step": 13523 + }, + { + "epoch": 2.5114206128133705, + "grad_norm": 1.605435848236084, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8976100087165833, + "num_tokens": 492824231.0, + "step": 13524 + }, + { + "epoch": 2.511606313834726, + "grad_norm": 1.9343640804290771, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8634912371635437, + "num_tokens": 492860807.0, + "step": 13525 + }, + { + "epoch": 2.5117920148560815, + "grad_norm": 1.5644632577896118, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.896731972694397, + "num_tokens": 492897307.0, + "step": 13526 + }, + { + "epoch": 2.5119777158774372, + "grad_norm": 1.6631124019622803, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8852925300598145, + "num_tokens": 492932836.0, + "step": 13527 + }, + { + "epoch": 2.512163416898793, + "grad_norm": 1.7092729806900024, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8857424259185791, + "num_tokens": 492966923.0, + "step": 13528 + }, + { + "epoch": 2.5123491179201487, + "grad_norm": 1.8121713399887085, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8903523087501526, + "num_tokens": 493001232.0, + "step": 13529 + }, + { + "epoch": 2.5125348189415044, + "grad_norm": 1.6946014165878296, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8894389867782593, + "num_tokens": 493035513.0, + "step": 13530 + }, + { + "epoch": 2.5127205199628597, + "grad_norm": 1.5080387592315674, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8930219411849976, + "num_tokens": 493074612.0, + "step": 13531 + }, + { + "epoch": 2.5129062209842155, + "grad_norm": 1.5810099840164185, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8840540051460266, + "num_tokens": 493110429.0, + "step": 13532 + }, + { + "epoch": 2.513091922005571, + "grad_norm": 1.766135334968567, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8670020699501038, + "num_tokens": 493144332.0, + "step": 13533 + }, + { + "epoch": 2.5132776230269265, + "grad_norm": 1.83888578414917, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8729090690612793, + "num_tokens": 493176306.0, + "step": 13534 + }, + { + "epoch": 2.513463324048282, + "grad_norm": 1.7689777612686157, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8853005170822144, + "num_tokens": 493209939.0, + "step": 13535 + }, + { + "epoch": 2.513649025069638, + "grad_norm": 1.5869837999343872, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8866552114486694, + "num_tokens": 493248286.0, + "step": 13536 + }, + { + "epoch": 2.5138347260909937, + "grad_norm": 1.6054885387420654, + "learning_rate": 1e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.8994234800338745, + "num_tokens": 493283436.0, + "step": 13537 + }, + { + "epoch": 2.514020427112349, + "grad_norm": 1.574623703956604, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8830370903015137, + "num_tokens": 493320660.0, + "step": 13538 + }, + { + "epoch": 2.5142061281337047, + "grad_norm": 1.6738923788070679, + "learning_rate": 1e-06, + "loss": 0.2849, + "mean_token_accuracy": 0.8967774510383606, + "num_tokens": 493353422.0, + "step": 13539 + }, + { + "epoch": 2.5143918291550604, + "grad_norm": 1.6571019887924194, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8637572526931763, + "num_tokens": 493393494.0, + "step": 13540 + }, + { + "epoch": 2.5145775301764157, + "grad_norm": 1.6904712915420532, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.880190372467041, + "num_tokens": 493426776.0, + "step": 13541 + }, + { + "epoch": 2.5147632311977715, + "grad_norm": 1.4052062034606934, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8818461894989014, + "num_tokens": 493477493.0, + "step": 13542 + }, + { + "epoch": 2.514948932219127, + "grad_norm": 1.689601182937622, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8918007612228394, + "num_tokens": 493510928.0, + "step": 13543 + }, + { + "epoch": 2.515134633240483, + "grad_norm": 1.7621015310287476, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8894186019897461, + "num_tokens": 493541675.0, + "step": 13544 + }, + { + "epoch": 2.5153203342618387, + "grad_norm": 1.5339819192886353, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8792335987091064, + "num_tokens": 493580596.0, + "step": 13545 + }, + { + "epoch": 2.515506035283194, + "grad_norm": 1.5957118272781372, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8903852105140686, + "num_tokens": 493617711.0, + "step": 13546 + }, + { + "epoch": 2.5156917363045497, + "grad_norm": 1.6063480377197266, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8829984068870544, + "num_tokens": 493652055.0, + "step": 13547 + }, + { + "epoch": 2.5158774373259054, + "grad_norm": 1.6524128913879395, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8772914409637451, + "num_tokens": 493689624.0, + "step": 13548 + }, + { + "epoch": 2.5160631383472607, + "grad_norm": 1.541744351387024, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8856331706047058, + "num_tokens": 493729223.0, + "step": 13549 + }, + { + "epoch": 2.5162488393686164, + "grad_norm": 1.639450192451477, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8859644532203674, + "num_tokens": 493765752.0, + "step": 13550 + }, + { + "epoch": 2.516434540389972, + "grad_norm": 1.737054705619812, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8827779293060303, + "num_tokens": 493795722.0, + "step": 13551 + }, + { + "epoch": 2.516620241411328, + "grad_norm": 1.6889524459838867, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8880493640899658, + "num_tokens": 493825911.0, + "step": 13552 + }, + { + "epoch": 2.5168059424326836, + "grad_norm": 1.8328664302825928, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8845562934875488, + "num_tokens": 493854714.0, + "step": 13553 + }, + { + "epoch": 2.516991643454039, + "grad_norm": 1.550755500793457, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8874828815460205, + "num_tokens": 493894280.0, + "step": 13554 + }, + { + "epoch": 2.5171773444753947, + "grad_norm": 1.6961755752563477, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8755861520767212, + "num_tokens": 493928302.0, + "step": 13555 + }, + { + "epoch": 2.5173630454967504, + "grad_norm": 1.6747578382492065, + "learning_rate": 1e-06, + "loss": 0.2832, + "mean_token_accuracy": 0.8990931510925293, + "num_tokens": 493959733.0, + "step": 13556 + }, + { + "epoch": 2.5175487465181057, + "grad_norm": 1.6245583295822144, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8830487728118896, + "num_tokens": 493995875.0, + "step": 13557 + }, + { + "epoch": 2.5177344475394614, + "grad_norm": 1.5897997617721558, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8917351961135864, + "num_tokens": 494033742.0, + "step": 13558 + }, + { + "epoch": 2.517920148560817, + "grad_norm": 1.5647653341293335, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.89107745885849, + "num_tokens": 494071563.0, + "step": 13559 + }, + { + "epoch": 2.518105849582173, + "grad_norm": 1.5423381328582764, + "learning_rate": 1e-06, + "loss": 0.2587, + "mean_token_accuracy": 0.906262218952179, + "num_tokens": 494107917.0, + "step": 13560 + }, + { + "epoch": 2.518291550603528, + "grad_norm": 1.6108176708221436, + "learning_rate": 1e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.8994887471199036, + "num_tokens": 494149341.0, + "step": 13561 + }, + { + "epoch": 2.518477251624884, + "grad_norm": 1.6900311708450317, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8850961327552795, + "num_tokens": 494181418.0, + "step": 13562 + }, + { + "epoch": 2.5186629526462396, + "grad_norm": 1.790824294090271, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8863733410835266, + "num_tokens": 494214444.0, + "step": 13563 + }, + { + "epoch": 2.518848653667595, + "grad_norm": 1.6258697509765625, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8834308385848999, + "num_tokens": 494256381.0, + "step": 13564 + }, + { + "epoch": 2.5190343546889506, + "grad_norm": 1.726336121559143, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8868664503097534, + "num_tokens": 494292329.0, + "step": 13565 + }, + { + "epoch": 2.5192200557103064, + "grad_norm": 1.5505715608596802, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8809114694595337, + "num_tokens": 494332951.0, + "step": 13566 + }, + { + "epoch": 2.519405756731662, + "grad_norm": 1.4557371139526367, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8977420330047607, + "num_tokens": 494373503.0, + "step": 13567 + }, + { + "epoch": 2.519591457753018, + "grad_norm": 1.677317500114441, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8786306381225586, + "num_tokens": 494411361.0, + "step": 13568 + }, + { + "epoch": 2.519777158774373, + "grad_norm": 1.6590228080749512, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8856159448623657, + "num_tokens": 494450061.0, + "step": 13569 + }, + { + "epoch": 2.519962859795729, + "grad_norm": 1.4987776279449463, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.893875241279602, + "num_tokens": 494490175.0, + "step": 13570 + }, + { + "epoch": 2.5201485608170846, + "grad_norm": 1.7059314250946045, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8800359964370728, + "num_tokens": 494523930.0, + "step": 13571 + }, + { + "epoch": 2.52033426183844, + "grad_norm": 1.5529847145080566, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8935703039169312, + "num_tokens": 494561448.0, + "step": 13572 + }, + { + "epoch": 2.5205199628597956, + "grad_norm": 1.6674933433532715, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8850616216659546, + "num_tokens": 494596410.0, + "step": 13573 + }, + { + "epoch": 2.5207056638811514, + "grad_norm": 1.6533156633377075, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8671675324440002, + "num_tokens": 494636594.0, + "step": 13574 + }, + { + "epoch": 2.520891364902507, + "grad_norm": 1.4596977233886719, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8980488777160645, + "num_tokens": 494675512.0, + "step": 13575 + }, + { + "epoch": 2.521077065923863, + "grad_norm": 1.591916799545288, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8899835348129272, + "num_tokens": 494709867.0, + "step": 13576 + }, + { + "epoch": 2.521262766945218, + "grad_norm": 1.6905734539031982, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8811801671981812, + "num_tokens": 494742751.0, + "step": 13577 + }, + { + "epoch": 2.521448467966574, + "grad_norm": 1.78643000125885, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.890272319316864, + "num_tokens": 494771854.0, + "step": 13578 + }, + { + "epoch": 2.5216341689879296, + "grad_norm": 1.7459650039672852, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8801778554916382, + "num_tokens": 494806404.0, + "step": 13579 + }, + { + "epoch": 2.521819870009285, + "grad_norm": 1.5510116815567017, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.888843297958374, + "num_tokens": 494845974.0, + "step": 13580 + }, + { + "epoch": 2.5220055710306406, + "grad_norm": 1.567983865737915, + "learning_rate": 1e-06, + "loss": 0.2742, + "mean_token_accuracy": 0.8995106220245361, + "num_tokens": 494881761.0, + "step": 13581 + }, + { + "epoch": 2.5221912720519963, + "grad_norm": 1.623971700668335, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8924171328544617, + "num_tokens": 494916954.0, + "step": 13582 + }, + { + "epoch": 2.522376973073352, + "grad_norm": 1.677595615386963, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8894816637039185, + "num_tokens": 494950455.0, + "step": 13583 + }, + { + "epoch": 2.5225626740947074, + "grad_norm": 1.7794750928878784, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8853265047073364, + "num_tokens": 494987635.0, + "step": 13584 + }, + { + "epoch": 2.522748375116063, + "grad_norm": 1.7714190483093262, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8838469386100769, + "num_tokens": 495023602.0, + "step": 13585 + }, + { + "epoch": 2.522934076137419, + "grad_norm": 1.5961458683013916, + "learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.897187352180481, + "num_tokens": 495055486.0, + "step": 13586 + }, + { + "epoch": 2.523119777158774, + "grad_norm": 1.696453332901001, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8727651834487915, + "num_tokens": 495097099.0, + "step": 13587 + }, + { + "epoch": 2.52330547818013, + "grad_norm": 1.623762845993042, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8851861953735352, + "num_tokens": 495132002.0, + "step": 13588 + }, + { + "epoch": 2.5234911792014856, + "grad_norm": 1.6838210821151733, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8926970958709717, + "num_tokens": 495167270.0, + "step": 13589 + }, + { + "epoch": 2.5236768802228413, + "grad_norm": 1.5304861068725586, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8843773007392883, + "num_tokens": 495208874.0, + "step": 13590 + }, + { + "epoch": 2.523862581244197, + "grad_norm": 1.6653186082839966, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8915709853172302, + "num_tokens": 495244668.0, + "step": 13591 + }, + { + "epoch": 2.5240482822655523, + "grad_norm": 1.5696403980255127, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8931422233581543, + "num_tokens": 495282778.0, + "step": 13592 + }, + { + "epoch": 2.524233983286908, + "grad_norm": 1.612726092338562, + "learning_rate": 1e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.8922333121299744, + "num_tokens": 495314833.0, + "step": 13593 + }, + { + "epoch": 2.524419684308264, + "grad_norm": 1.5495414733886719, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8931413888931274, + "num_tokens": 495351411.0, + "step": 13594 + }, + { + "epoch": 2.524605385329619, + "grad_norm": 1.6858960390090942, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8842675685882568, + "num_tokens": 495389693.0, + "step": 13595 + }, + { + "epoch": 2.524791086350975, + "grad_norm": 1.6505192518234253, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8853667974472046, + "num_tokens": 495427367.0, + "step": 13596 + }, + { + "epoch": 2.5249767873723306, + "grad_norm": 1.5427411794662476, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8850800395011902, + "num_tokens": 495467018.0, + "step": 13597 + }, + { + "epoch": 2.5251624883936863, + "grad_norm": 1.6815203428268433, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8895220756530762, + "num_tokens": 495502542.0, + "step": 13598 + }, + { + "epoch": 2.525348189415042, + "grad_norm": 1.6127959489822388, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8774599432945251, + "num_tokens": 495542429.0, + "step": 13599 + }, + { + "epoch": 2.5255338904363973, + "grad_norm": 1.7268482446670532, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8896066546440125, + "num_tokens": 495575385.0, + "step": 13600 + }, + { + "epoch": 2.525719591457753, + "grad_norm": 1.7457484006881714, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8639754056930542, + "num_tokens": 495610205.0, + "step": 13601 + }, + { + "epoch": 2.5259052924791088, + "grad_norm": 1.574703335762024, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8781833648681641, + "num_tokens": 495647988.0, + "step": 13602 + }, + { + "epoch": 2.526090993500464, + "grad_norm": 1.554033875465393, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8800168037414551, + "num_tokens": 495687288.0, + "step": 13603 + }, + { + "epoch": 2.52627669452182, + "grad_norm": 1.5047358274459839, + "learning_rate": 1e-06, + "loss": 0.2757, + "mean_token_accuracy": 0.8985614776611328, + "num_tokens": 495724550.0, + "step": 13604 + }, + { + "epoch": 2.5264623955431755, + "grad_norm": 1.5264735221862793, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.883481502532959, + "num_tokens": 495764967.0, + "step": 13605 + }, + { + "epoch": 2.5266480965645313, + "grad_norm": 1.6112666130065918, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8822305202484131, + "num_tokens": 495800189.0, + "step": 13606 + }, + { + "epoch": 2.526833797585887, + "grad_norm": 1.5507607460021973, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.889622151851654, + "num_tokens": 495839957.0, + "step": 13607 + }, + { + "epoch": 2.5270194986072423, + "grad_norm": 1.6452040672302246, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.889387309551239, + "num_tokens": 495873982.0, + "step": 13608 + }, + { + "epoch": 2.527205199628598, + "grad_norm": 1.6489323377609253, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8884825706481934, + "num_tokens": 495907637.0, + "step": 13609 + }, + { + "epoch": 2.5273909006499533, + "grad_norm": 1.531015157699585, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8808327913284302, + "num_tokens": 495947613.0, + "step": 13610 + }, + { + "epoch": 2.527576601671309, + "grad_norm": 1.7414454221725464, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8968396186828613, + "num_tokens": 495979804.0, + "step": 13611 + }, + { + "epoch": 2.5277623026926648, + "grad_norm": 1.9232255220413208, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8953820466995239, + "num_tokens": 496008030.0, + "step": 13612 + }, + { + "epoch": 2.5279480037140205, + "grad_norm": 1.5756969451904297, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8905457258224487, + "num_tokens": 496048780.0, + "step": 13613 + }, + { + "epoch": 2.5281337047353762, + "grad_norm": 1.6195780038833618, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8861052989959717, + "num_tokens": 496093723.0, + "step": 13614 + }, + { + "epoch": 2.5283194057567315, + "grad_norm": 1.649293065071106, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8871710300445557, + "num_tokens": 496125962.0, + "step": 13615 + }, + { + "epoch": 2.5285051067780873, + "grad_norm": 1.618291974067688, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.886844277381897, + "num_tokens": 496162137.0, + "step": 13616 + }, + { + "epoch": 2.528690807799443, + "grad_norm": 1.520930528640747, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8874630928039551, + "num_tokens": 496203650.0, + "step": 13617 + }, + { + "epoch": 2.5288765088207983, + "grad_norm": 1.5817053318023682, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.881631076335907, + "num_tokens": 496242892.0, + "step": 13618 + }, + { + "epoch": 2.529062209842154, + "grad_norm": 1.7329955101013184, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8901305794715881, + "num_tokens": 496275773.0, + "step": 13619 + }, + { + "epoch": 2.5292479108635098, + "grad_norm": 1.6463282108306885, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8884432315826416, + "num_tokens": 496313996.0, + "step": 13620 + }, + { + "epoch": 2.5294336118848655, + "grad_norm": 1.6757173538208008, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8618553876876831, + "num_tokens": 496351827.0, + "step": 13621 + }, + { + "epoch": 2.529619312906221, + "grad_norm": 1.7017531394958496, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8722248077392578, + "num_tokens": 496387898.0, + "step": 13622 + }, + { + "epoch": 2.5298050139275765, + "grad_norm": 1.8220503330230713, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8927059173583984, + "num_tokens": 496416512.0, + "step": 13623 + }, + { + "epoch": 2.5299907149489322, + "grad_norm": 1.7259440422058105, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8870160579681396, + "num_tokens": 496449673.0, + "step": 13624 + }, + { + "epoch": 2.530176415970288, + "grad_norm": 1.563456654548645, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8743578791618347, + "num_tokens": 496490266.0, + "step": 13625 + }, + { + "epoch": 2.5303621169916433, + "grad_norm": 1.6477785110473633, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.872586727142334, + "num_tokens": 496526170.0, + "step": 13626 + }, + { + "epoch": 2.530547818012999, + "grad_norm": 1.6826623678207397, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8863054513931274, + "num_tokens": 496560798.0, + "step": 13627 + }, + { + "epoch": 2.5307335190343547, + "grad_norm": 1.537847638130188, + "learning_rate": 1e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9030325412750244, + "num_tokens": 496598006.0, + "step": 13628 + }, + { + "epoch": 2.5309192200557105, + "grad_norm": 1.5610315799713135, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8892862796783447, + "num_tokens": 496635235.0, + "step": 13629 + }, + { + "epoch": 2.531104921077066, + "grad_norm": 1.633962869644165, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8814212679862976, + "num_tokens": 496673259.0, + "step": 13630 + }, + { + "epoch": 2.5312906220984215, + "grad_norm": 1.5741292238235474, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8915146589279175, + "num_tokens": 496709766.0, + "step": 13631 + }, + { + "epoch": 2.531476323119777, + "grad_norm": 1.6084680557250977, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8862659931182861, + "num_tokens": 496747241.0, + "step": 13632 + }, + { + "epoch": 2.5316620241411325, + "grad_norm": 1.6654571294784546, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8748100996017456, + "num_tokens": 496783681.0, + "step": 13633 + }, + { + "epoch": 2.5318477251624882, + "grad_norm": 1.7103971242904663, + "learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.8946945667266846, + "num_tokens": 496812047.0, + "step": 13634 + }, + { + "epoch": 2.532033426183844, + "grad_norm": 1.5392292737960815, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8822773098945618, + "num_tokens": 496851211.0, + "step": 13635 + }, + { + "epoch": 2.5322191272051997, + "grad_norm": 1.509844183921814, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8881022930145264, + "num_tokens": 496890680.0, + "step": 13636 + }, + { + "epoch": 2.5324048282265554, + "grad_norm": 1.5447863340377808, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8815517425537109, + "num_tokens": 496931676.0, + "step": 13637 + }, + { + "epoch": 2.5325905292479107, + "grad_norm": 1.6212818622589111, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8947477340698242, + "num_tokens": 496970441.0, + "step": 13638 + }, + { + "epoch": 2.5327762302692665, + "grad_norm": 1.5319985151290894, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.872305154800415, + "num_tokens": 497011090.0, + "step": 13639 + }, + { + "epoch": 2.532961931290622, + "grad_norm": 1.6190756559371948, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8940730690956116, + "num_tokens": 497047267.0, + "step": 13640 + }, + { + "epoch": 2.5331476323119775, + "grad_norm": 1.622860312461853, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8765307664871216, + "num_tokens": 497085008.0, + "step": 13641 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 1.6283633708953857, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8895018100738525, + "num_tokens": 497118285.0, + "step": 13642 + }, + { + "epoch": 2.533519034354689, + "grad_norm": 1.5641282796859741, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8804731369018555, + "num_tokens": 497157047.0, + "step": 13643 + }, + { + "epoch": 2.5337047353760447, + "grad_norm": 1.5004440546035767, + "learning_rate": 1e-06, + "loss": 0.2628, + "mean_token_accuracy": 0.9060138463973999, + "num_tokens": 497191725.0, + "step": 13644 + }, + { + "epoch": 2.5338904363974004, + "grad_norm": 1.7102317810058594, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8842261433601379, + "num_tokens": 497226414.0, + "step": 13645 + }, + { + "epoch": 2.5340761374187557, + "grad_norm": 1.5779533386230469, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8935110569000244, + "num_tokens": 497261605.0, + "step": 13646 + }, + { + "epoch": 2.5342618384401114, + "grad_norm": 1.3774112462997437, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8901481628417969, + "num_tokens": 497305974.0, + "step": 13647 + }, + { + "epoch": 2.534447539461467, + "grad_norm": 1.5455381870269775, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.877776026725769, + "num_tokens": 497345644.0, + "step": 13648 + }, + { + "epoch": 2.5346332404828225, + "grad_norm": 1.700026035308838, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8855509757995605, + "num_tokens": 497380228.0, + "step": 13649 + }, + { + "epoch": 2.534818941504178, + "grad_norm": 1.6574368476867676, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8870072960853577, + "num_tokens": 497417062.0, + "step": 13650 + }, + { + "epoch": 2.535004642525534, + "grad_norm": 1.5825562477111816, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8868356347084045, + "num_tokens": 497452890.0, + "step": 13651 + }, + { + "epoch": 2.5351903435468897, + "grad_norm": 1.6088398694992065, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8722309470176697, + "num_tokens": 497488778.0, + "step": 13652 + }, + { + "epoch": 2.5353760445682454, + "grad_norm": 1.6048035621643066, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8931556940078735, + "num_tokens": 497524249.0, + "step": 13653 + }, + { + "epoch": 2.5355617455896007, + "grad_norm": 1.62968111038208, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8892515301704407, + "num_tokens": 497558326.0, + "step": 13654 + }, + { + "epoch": 2.5357474466109564, + "grad_norm": 1.731584906578064, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8837347030639648, + "num_tokens": 497589124.0, + "step": 13655 + }, + { + "epoch": 2.5359331476323117, + "grad_norm": 1.3844255208969116, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8964747190475464, + "num_tokens": 497633168.0, + "step": 13656 + }, + { + "epoch": 2.5361188486536674, + "grad_norm": 1.6456495523452759, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8934275507926941, + "num_tokens": 497667546.0, + "step": 13657 + }, + { + "epoch": 2.536304549675023, + "grad_norm": 1.4726593494415283, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8896244168281555, + "num_tokens": 497707211.0, + "step": 13658 + }, + { + "epoch": 2.536490250696379, + "grad_norm": 1.570255160331726, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8912440538406372, + "num_tokens": 497747641.0, + "step": 13659 + }, + { + "epoch": 2.5366759517177346, + "grad_norm": 1.6592954397201538, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8815253973007202, + "num_tokens": 497784531.0, + "step": 13660 + }, + { + "epoch": 2.53686165273909, + "grad_norm": 1.6289443969726562, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8749899864196777, + "num_tokens": 497822656.0, + "step": 13661 + }, + { + "epoch": 2.5370473537604457, + "grad_norm": 1.574856162071228, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.895698070526123, + "num_tokens": 497857612.0, + "step": 13662 + }, + { + "epoch": 2.5372330547818014, + "grad_norm": 1.5901663303375244, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8844856023788452, + "num_tokens": 497895034.0, + "step": 13663 + }, + { + "epoch": 2.5374187558031567, + "grad_norm": 1.749447226524353, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8774080276489258, + "num_tokens": 497930605.0, + "step": 13664 + }, + { + "epoch": 2.5376044568245124, + "grad_norm": 1.5649054050445557, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8941458463668823, + "num_tokens": 497967065.0, + "step": 13665 + }, + { + "epoch": 2.537790157845868, + "grad_norm": 1.556023359298706, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8856347799301147, + "num_tokens": 498003457.0, + "step": 13666 + }, + { + "epoch": 2.537975858867224, + "grad_norm": 1.692735195159912, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8877044916152954, + "num_tokens": 498039682.0, + "step": 13667 + }, + { + "epoch": 2.5381615598885796, + "grad_norm": 1.7705320119857788, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.889468789100647, + "num_tokens": 498073922.0, + "step": 13668 + }, + { + "epoch": 2.538347260909935, + "grad_norm": 1.5804548263549805, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.888609766960144, + "num_tokens": 498108142.0, + "step": 13669 + }, + { + "epoch": 2.5385329619312906, + "grad_norm": 1.6068224906921387, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8969742655754089, + "num_tokens": 498141524.0, + "step": 13670 + }, + { + "epoch": 2.5387186629526464, + "grad_norm": 1.7414348125457764, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8798340559005737, + "num_tokens": 498174867.0, + "step": 13671 + }, + { + "epoch": 2.5389043639740017, + "grad_norm": 1.6410585641860962, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8768423795700073, + "num_tokens": 498213624.0, + "step": 13672 + }, + { + "epoch": 2.5390900649953574, + "grad_norm": 1.6318448781967163, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8820353150367737, + "num_tokens": 498250395.0, + "step": 13673 + }, + { + "epoch": 2.539275766016713, + "grad_norm": 1.6103696823120117, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8748345375061035, + "num_tokens": 498293376.0, + "step": 13674 + }, + { + "epoch": 2.539461467038069, + "grad_norm": 1.6040129661560059, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8950619101524353, + "num_tokens": 498327073.0, + "step": 13675 + }, + { + "epoch": 2.5396471680594246, + "grad_norm": 1.6484946012496948, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8862674236297607, + "num_tokens": 498361370.0, + "step": 13676 + }, + { + "epoch": 2.53983286908078, + "grad_norm": 1.6804778575897217, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8899646997451782, + "num_tokens": 498395909.0, + "step": 13677 + }, + { + "epoch": 2.5400185701021356, + "grad_norm": 1.5214877128601074, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8784259557723999, + "num_tokens": 498433804.0, + "step": 13678 + }, + { + "epoch": 2.5402042711234913, + "grad_norm": 1.4472324848175049, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.8976964354515076, + "num_tokens": 498474782.0, + "step": 13679 + }, + { + "epoch": 2.5403899721448466, + "grad_norm": 1.8231412172317505, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8846663236618042, + "num_tokens": 498505158.0, + "step": 13680 + }, + { + "epoch": 2.5405756731662024, + "grad_norm": 1.7264995574951172, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8858161568641663, + "num_tokens": 498539982.0, + "step": 13681 + }, + { + "epoch": 2.540761374187558, + "grad_norm": 1.577694058418274, + "learning_rate": 1e-06, + "loss": 0.28, + "mean_token_accuracy": 0.8989996314048767, + "num_tokens": 498577319.0, + "step": 13682 + }, + { + "epoch": 2.540947075208914, + "grad_norm": 1.608516812324524, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.892156183719635, + "num_tokens": 498612715.0, + "step": 13683 + }, + { + "epoch": 2.541132776230269, + "grad_norm": 1.6997630596160889, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8882642984390259, + "num_tokens": 498645518.0, + "step": 13684 + }, + { + "epoch": 2.541318477251625, + "grad_norm": 1.588780403137207, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8672279119491577, + "num_tokens": 498686081.0, + "step": 13685 + }, + { + "epoch": 2.5415041782729806, + "grad_norm": 1.5863405466079712, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8884215354919434, + "num_tokens": 498726462.0, + "step": 13686 + }, + { + "epoch": 2.541689879294336, + "grad_norm": 1.7986451387405396, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8871563673019409, + "num_tokens": 498760486.0, + "step": 13687 + }, + { + "epoch": 2.5418755803156916, + "grad_norm": 1.5841618776321411, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8815314769744873, + "num_tokens": 498804051.0, + "step": 13688 + }, + { + "epoch": 2.5420612813370473, + "grad_norm": 1.4575657844543457, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8845795392990112, + "num_tokens": 498850861.0, + "step": 13689 + }, + { + "epoch": 2.542246982358403, + "grad_norm": 1.672378659248352, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8956208825111389, + "num_tokens": 498882525.0, + "step": 13690 + }, + { + "epoch": 2.542432683379759, + "grad_norm": 1.7501391172409058, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8770697116851807, + "num_tokens": 498916935.0, + "step": 13691 + }, + { + "epoch": 2.542618384401114, + "grad_norm": 1.612079381942749, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8786349296569824, + "num_tokens": 498955875.0, + "step": 13692 + }, + { + "epoch": 2.54280408542247, + "grad_norm": 1.5678433179855347, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8939353227615356, + "num_tokens": 498996135.0, + "step": 13693 + }, + { + "epoch": 2.5429897864438256, + "grad_norm": 1.5452605485916138, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8882105350494385, + "num_tokens": 499034145.0, + "step": 13694 + }, + { + "epoch": 2.543175487465181, + "grad_norm": 1.475951910018921, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8953008055686951, + "num_tokens": 499074206.0, + "step": 13695 + }, + { + "epoch": 2.5433611884865366, + "grad_norm": 1.5700129270553589, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8891280889511108, + "num_tokens": 499109952.0, + "step": 13696 + }, + { + "epoch": 2.5435468895078923, + "grad_norm": 1.5526981353759766, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8889175057411194, + "num_tokens": 499146689.0, + "step": 13697 + }, + { + "epoch": 2.543732590529248, + "grad_norm": 1.6574615240097046, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8785495758056641, + "num_tokens": 499183757.0, + "step": 13698 + }, + { + "epoch": 2.543918291550604, + "grad_norm": 1.5616422891616821, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8966653347015381, + "num_tokens": 499220924.0, + "step": 13699 + }, + { + "epoch": 2.544103992571959, + "grad_norm": 1.520135521888733, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8944916725158691, + "num_tokens": 499258969.0, + "step": 13700 + }, + { + "epoch": 2.544289693593315, + "grad_norm": 1.6282914876937866, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8915325403213501, + "num_tokens": 499294601.0, + "step": 13701 + }, + { + "epoch": 2.5444753946146705, + "grad_norm": 1.5973824262619019, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8789118528366089, + "num_tokens": 499332425.0, + "step": 13702 + }, + { + "epoch": 2.544661095636026, + "grad_norm": 1.8618085384368896, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8745212554931641, + "num_tokens": 499364395.0, + "step": 13703 + }, + { + "epoch": 2.5448467966573816, + "grad_norm": 1.650809407234192, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8873634934425354, + "num_tokens": 499399132.0, + "step": 13704 + }, + { + "epoch": 2.5450324976787373, + "grad_norm": 1.4983470439910889, + "learning_rate": 1e-06, + "loss": 0.2533, + "mean_token_accuracy": 0.9083892107009888, + "num_tokens": 499432695.0, + "step": 13705 + }, + { + "epoch": 2.545218198700093, + "grad_norm": 1.5247857570648193, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8952172994613647, + "num_tokens": 499469946.0, + "step": 13706 + }, + { + "epoch": 2.5454038997214483, + "grad_norm": 1.586198329925537, + "learning_rate": 1e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.899752140045166, + "num_tokens": 499503964.0, + "step": 13707 + }, + { + "epoch": 2.545589600742804, + "grad_norm": 1.5831451416015625, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8868774771690369, + "num_tokens": 499539380.0, + "step": 13708 + }, + { + "epoch": 2.54577530176416, + "grad_norm": 1.6369880437850952, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8813132047653198, + "num_tokens": 499579034.0, + "step": 13709 + }, + { + "epoch": 2.545961002785515, + "grad_norm": 1.5469732284545898, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8927003145217896, + "num_tokens": 499618029.0, + "step": 13710 + }, + { + "epoch": 2.546146703806871, + "grad_norm": 1.6439628601074219, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8907158970832825, + "num_tokens": 499649907.0, + "step": 13711 + }, + { + "epoch": 2.5463324048282265, + "grad_norm": 1.6929144859313965, + "learning_rate": 1e-06, + "loss": 0.2867, + "mean_token_accuracy": 0.8974932432174683, + "num_tokens": 499681201.0, + "step": 13712 + }, + { + "epoch": 2.5465181058495823, + "grad_norm": 1.7150509357452393, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8833373785018921, + "num_tokens": 499715945.0, + "step": 13713 + }, + { + "epoch": 2.546703806870938, + "grad_norm": 1.5705413818359375, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8903011679649353, + "num_tokens": 499752799.0, + "step": 13714 + }, + { + "epoch": 2.5468895078922933, + "grad_norm": 1.6007142066955566, + "learning_rate": 1e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.8959382176399231, + "num_tokens": 499787853.0, + "step": 13715 + }, + { + "epoch": 2.547075208913649, + "grad_norm": 1.663846731185913, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8903393745422363, + "num_tokens": 499825381.0, + "step": 13716 + }, + { + "epoch": 2.5472609099350048, + "grad_norm": 1.8115824460983276, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8776106834411621, + "num_tokens": 499855881.0, + "step": 13717 + }, + { + "epoch": 2.54744661095636, + "grad_norm": 1.5784350633621216, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8859251737594604, + "num_tokens": 499893749.0, + "step": 13718 + }, + { + "epoch": 2.547632311977716, + "grad_norm": 1.4147372245788574, + "learning_rate": 1e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.8993608951568604, + "num_tokens": 499936479.0, + "step": 13719 + }, + { + "epoch": 2.5478180129990715, + "grad_norm": 1.5633503198623657, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.885090708732605, + "num_tokens": 499977289.0, + "step": 13720 + }, + { + "epoch": 2.5480037140204272, + "grad_norm": 1.7299093008041382, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8911796808242798, + "num_tokens": 500007491.0, + "step": 13721 + }, + { + "epoch": 2.548189415041783, + "grad_norm": 1.748432993888855, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8949234485626221, + "num_tokens": 500039225.0, + "step": 13722 + }, + { + "epoch": 2.5483751160631383, + "grad_norm": 1.4781070947647095, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8890606164932251, + "num_tokens": 500082203.0, + "step": 13723 + }, + { + "epoch": 2.548560817084494, + "grad_norm": 1.7014806270599365, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8636161088943481, + "num_tokens": 500120235.0, + "step": 13724 + }, + { + "epoch": 2.5487465181058497, + "grad_norm": 1.698775053024292, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8823931217193604, + "num_tokens": 500155460.0, + "step": 13725 + }, + { + "epoch": 2.548932219127205, + "grad_norm": 1.6047782897949219, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8879897594451904, + "num_tokens": 500192485.0, + "step": 13726 + }, + { + "epoch": 2.5491179201485608, + "grad_norm": 1.7668358087539673, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8718189001083374, + "num_tokens": 500226090.0, + "step": 13727 + }, + { + "epoch": 2.5493036211699165, + "grad_norm": 1.6140038967132568, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.8972054123878479, + "num_tokens": 500262537.0, + "step": 13728 + }, + { + "epoch": 2.5494893221912722, + "grad_norm": 1.536819577217102, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8945983648300171, + "num_tokens": 500301161.0, + "step": 13729 + }, + { + "epoch": 2.5496750232126275, + "grad_norm": 1.7106091976165771, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8938112854957581, + "num_tokens": 500332532.0, + "step": 13730 + }, + { + "epoch": 2.5498607242339832, + "grad_norm": 1.5712984800338745, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8856345415115356, + "num_tokens": 500371275.0, + "step": 13731 + }, + { + "epoch": 2.550046425255339, + "grad_norm": 1.554653286933899, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8900334239006042, + "num_tokens": 500413321.0, + "step": 13732 + }, + { + "epoch": 2.5502321262766943, + "grad_norm": 1.763038992881775, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8764433860778809, + "num_tokens": 500446048.0, + "step": 13733 + }, + { + "epoch": 2.55041782729805, + "grad_norm": 1.7582656145095825, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8817524909973145, + "num_tokens": 500477354.0, + "step": 13734 + }, + { + "epoch": 2.5506035283194057, + "grad_norm": 1.6419651508331299, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8714781999588013, + "num_tokens": 500517522.0, + "step": 13735 + }, + { + "epoch": 2.5507892293407615, + "grad_norm": 1.7215501070022583, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8961035013198853, + "num_tokens": 500551694.0, + "step": 13736 + }, + { + "epoch": 2.550974930362117, + "grad_norm": 1.4778149127960205, + "learning_rate": 1e-06, + "loss": 0.2484, + "mean_token_accuracy": 0.9077169895172119, + "num_tokens": 500587853.0, + "step": 13737 + }, + { + "epoch": 2.5511606313834725, + "grad_norm": 1.7413841485977173, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8931649923324585, + "num_tokens": 500619248.0, + "step": 13738 + }, + { + "epoch": 2.551346332404828, + "grad_norm": 1.5931161642074585, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8897461295127869, + "num_tokens": 500654756.0, + "step": 13739 + }, + { + "epoch": 2.551532033426184, + "grad_norm": 1.45045006275177, + "learning_rate": 1e-06, + "loss": 0.2717, + "mean_token_accuracy": 0.9010353088378906, + "num_tokens": 500693697.0, + "step": 13740 + }, + { + "epoch": 2.5517177344475392, + "grad_norm": 1.510785698890686, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8873034715652466, + "num_tokens": 500734008.0, + "step": 13741 + }, + { + "epoch": 2.551903435468895, + "grad_norm": 1.7471299171447754, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.883474588394165, + "num_tokens": 500773096.0, + "step": 13742 + }, + { + "epoch": 2.5520891364902507, + "grad_norm": 1.8163083791732788, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8785381317138672, + "num_tokens": 500807035.0, + "step": 13743 + }, + { + "epoch": 2.5522748375116064, + "grad_norm": 1.416611909866333, + "learning_rate": 1e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.8966412544250488, + "num_tokens": 500851770.0, + "step": 13744 + }, + { + "epoch": 2.552460538532962, + "grad_norm": 1.5546600818634033, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8841285705566406, + "num_tokens": 500889925.0, + "step": 13745 + }, + { + "epoch": 2.5526462395543175, + "grad_norm": 1.6843043565750122, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8910456299781799, + "num_tokens": 500922841.0, + "step": 13746 + }, + { + "epoch": 2.552831940575673, + "grad_norm": 1.628494143486023, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8914046287536621, + "num_tokens": 500955555.0, + "step": 13747 + }, + { + "epoch": 2.553017641597029, + "grad_norm": 1.4875808954238892, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8931241035461426, + "num_tokens": 500994888.0, + "step": 13748 + }, + { + "epoch": 2.553203342618384, + "grad_norm": 1.6689636707305908, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8717073798179626, + "num_tokens": 501032650.0, + "step": 13749 + }, + { + "epoch": 2.55338904363974, + "grad_norm": 1.673814296722412, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8881086707115173, + "num_tokens": 501068969.0, + "step": 13750 + }, + { + "epoch": 2.5535747446610957, + "grad_norm": 1.5482897758483887, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.8948615789413452, + "num_tokens": 501106642.0, + "step": 13751 + }, + { + "epoch": 2.5537604456824514, + "grad_norm": 1.5917577743530273, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8895673155784607, + "num_tokens": 501145229.0, + "step": 13752 + }, + { + "epoch": 2.5539461467038067, + "grad_norm": 1.6511621475219727, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8852810859680176, + "num_tokens": 501183807.0, + "step": 13753 + }, + { + "epoch": 2.5541318477251624, + "grad_norm": 1.672329068183899, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8764026165008545, + "num_tokens": 501221034.0, + "step": 13754 + }, + { + "epoch": 2.554317548746518, + "grad_norm": 1.494979739189148, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8898934125900269, + "num_tokens": 501261040.0, + "step": 13755 + }, + { + "epoch": 2.5545032497678735, + "grad_norm": 1.5668658018112183, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8911173939704895, + "num_tokens": 501297817.0, + "step": 13756 + }, + { + "epoch": 2.554688950789229, + "grad_norm": 1.8663634061813354, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8781311511993408, + "num_tokens": 501329030.0, + "step": 13757 + }, + { + "epoch": 2.554874651810585, + "grad_norm": 1.5146061182022095, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8828001022338867, + "num_tokens": 501372057.0, + "step": 13758 + }, + { + "epoch": 2.5550603528319407, + "grad_norm": 1.5594062805175781, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8710603713989258, + "num_tokens": 501414734.0, + "step": 13759 + }, + { + "epoch": 2.5552460538532964, + "grad_norm": 1.786745309829712, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8894404172897339, + "num_tokens": 501446025.0, + "step": 13760 + }, + { + "epoch": 2.5554317548746517, + "grad_norm": 1.6034777164459229, + "learning_rate": 1e-06, + "loss": 0.2717, + "mean_token_accuracy": 0.8998092412948608, + "num_tokens": 501477670.0, + "step": 13761 + }, + { + "epoch": 2.5556174558960074, + "grad_norm": 1.65397047996521, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8862554430961609, + "num_tokens": 501513132.0, + "step": 13762 + }, + { + "epoch": 2.555803156917363, + "grad_norm": 1.5936899185180664, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8914936184883118, + "num_tokens": 501547828.0, + "step": 13763 + }, + { + "epoch": 2.5559888579387184, + "grad_norm": 1.5601308345794678, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8863163590431213, + "num_tokens": 501586200.0, + "step": 13764 + }, + { + "epoch": 2.556174558960074, + "grad_norm": 1.6883513927459717, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8939468264579773, + "num_tokens": 501620721.0, + "step": 13765 + }, + { + "epoch": 2.55636025998143, + "grad_norm": 1.752423882484436, + "learning_rate": 1e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.8951953649520874, + "num_tokens": 501653536.0, + "step": 13766 + }, + { + "epoch": 2.5565459610027856, + "grad_norm": 1.6495516300201416, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8864579200744629, + "num_tokens": 501688458.0, + "step": 13767 + }, + { + "epoch": 2.5567316620241414, + "grad_norm": 1.7548351287841797, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8766629695892334, + "num_tokens": 501724255.0, + "step": 13768 + }, + { + "epoch": 2.5569173630454967, + "grad_norm": 1.656699538230896, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.8963594436645508, + "num_tokens": 501758183.0, + "step": 13769 + }, + { + "epoch": 2.5571030640668524, + "grad_norm": 1.624450922012329, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8802990913391113, + "num_tokens": 501795560.0, + "step": 13770 + }, + { + "epoch": 2.557288765088208, + "grad_norm": 1.614493727684021, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.867415189743042, + "num_tokens": 501834887.0, + "step": 13771 + }, + { + "epoch": 2.5574744661095634, + "grad_norm": 1.5558656454086304, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8752635717391968, + "num_tokens": 501875126.0, + "step": 13772 + }, + { + "epoch": 2.557660167130919, + "grad_norm": 1.5473778247833252, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8768410682678223, + "num_tokens": 501915287.0, + "step": 13773 + }, + { + "epoch": 2.557845868152275, + "grad_norm": 1.7034904956817627, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8846749067306519, + "num_tokens": 501947205.0, + "step": 13774 + }, + { + "epoch": 2.5580315691736306, + "grad_norm": 1.463930606842041, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.898005485534668, + "num_tokens": 501987859.0, + "step": 13775 + }, + { + "epoch": 2.5582172701949863, + "grad_norm": 1.8250534534454346, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.8964508771896362, + "num_tokens": 502016992.0, + "step": 13776 + }, + { + "epoch": 2.5584029712163416, + "grad_norm": 1.6441289186477661, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8963860273361206, + "num_tokens": 502050922.0, + "step": 13777 + }, + { + "epoch": 2.5585886722376974, + "grad_norm": 1.5017887353897095, + "learning_rate": 1e-06, + "loss": 0.2789, + "mean_token_accuracy": 0.8983409404754639, + "num_tokens": 502088163.0, + "step": 13778 + }, + { + "epoch": 2.5587743732590527, + "grad_norm": 1.5511835813522339, + "learning_rate": 1e-06, + "loss": 0.281, + "mean_token_accuracy": 0.8992023468017578, + "num_tokens": 502121269.0, + "step": 13779 + }, + { + "epoch": 2.5589600742804084, + "grad_norm": 1.6212903261184692, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8852808475494385, + "num_tokens": 502160593.0, + "step": 13780 + }, + { + "epoch": 2.559145775301764, + "grad_norm": 1.692349910736084, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8858005404472351, + "num_tokens": 502193496.0, + "step": 13781 + }, + { + "epoch": 2.55933147632312, + "grad_norm": 1.516409993171692, + "learning_rate": 1e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9004389047622681, + "num_tokens": 502229442.0, + "step": 13782 + }, + { + "epoch": 2.5595171773444756, + "grad_norm": 1.6100783348083496, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8811079263687134, + "num_tokens": 502269065.0, + "step": 13783 + }, + { + "epoch": 2.559702878365831, + "grad_norm": 1.422361969947815, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8982915878295898, + "num_tokens": 502312313.0, + "step": 13784 + }, + { + "epoch": 2.5598885793871866, + "grad_norm": 1.7579890489578247, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.886711835861206, + "num_tokens": 502346981.0, + "step": 13785 + }, + { + "epoch": 2.5600742804085423, + "grad_norm": 1.811474084854126, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8813494443893433, + "num_tokens": 502382194.0, + "step": 13786 + }, + { + "epoch": 2.5602599814298976, + "grad_norm": 1.6683149337768555, + "learning_rate": 1e-06, + "loss": 0.2839, + "mean_token_accuracy": 0.8948641419410706, + "num_tokens": 502415206.0, + "step": 13787 + }, + { + "epoch": 2.5604456824512534, + "grad_norm": 1.5654046535491943, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8829859495162964, + "num_tokens": 502456184.0, + "step": 13788 + }, + { + "epoch": 2.560631383472609, + "grad_norm": 1.5512011051177979, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8854937553405762, + "num_tokens": 502499688.0, + "step": 13789 + }, + { + "epoch": 2.560817084493965, + "grad_norm": 1.694938063621521, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8824980854988098, + "num_tokens": 502535333.0, + "step": 13790 + }, + { + "epoch": 2.5610027855153206, + "grad_norm": 1.631251335144043, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8765357136726379, + "num_tokens": 502575948.0, + "step": 13791 + }, + { + "epoch": 2.561188486536676, + "grad_norm": 1.662959098815918, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.8933529853820801, + "num_tokens": 502613326.0, + "step": 13792 + }, + { + "epoch": 2.5613741875580316, + "grad_norm": 1.5129815340042114, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8841630220413208, + "num_tokens": 502653010.0, + "step": 13793 + }, + { + "epoch": 2.5615598885793873, + "grad_norm": 1.784688949584961, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8858017921447754, + "num_tokens": 502684916.0, + "step": 13794 + }, + { + "epoch": 2.5617455896007426, + "grad_norm": 1.652958869934082, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.890473484992981, + "num_tokens": 502717423.0, + "step": 13795 + }, + { + "epoch": 2.5619312906220983, + "grad_norm": 1.6655447483062744, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8938521146774292, + "num_tokens": 502748533.0, + "step": 13796 + }, + { + "epoch": 2.562116991643454, + "grad_norm": 1.7559689283370972, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8954672813415527, + "num_tokens": 502780408.0, + "step": 13797 + }, + { + "epoch": 2.56230269266481, + "grad_norm": 1.6553243398666382, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8936165571212769, + "num_tokens": 502815131.0, + "step": 13798 + }, + { + "epoch": 2.5624883936861655, + "grad_norm": 1.4957658052444458, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8860756754875183, + "num_tokens": 502856814.0, + "step": 13799 + }, + { + "epoch": 2.562674094707521, + "grad_norm": 1.492573618888855, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8781813383102417, + "num_tokens": 502895188.0, + "step": 13800 + }, + { + "epoch": 2.5628597957288766, + "grad_norm": 1.6333411931991577, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8894229531288147, + "num_tokens": 502927278.0, + "step": 13801 + }, + { + "epoch": 2.563045496750232, + "grad_norm": 1.5276867151260376, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8884990215301514, + "num_tokens": 502966577.0, + "step": 13802 + }, + { + "epoch": 2.5632311977715876, + "grad_norm": 1.5817314386367798, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8731850385665894, + "num_tokens": 503006512.0, + "step": 13803 + }, + { + "epoch": 2.5634168987929433, + "grad_norm": 1.6434996128082275, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8971636891365051, + "num_tokens": 503040786.0, + "step": 13804 + }, + { + "epoch": 2.563602599814299, + "grad_norm": 1.5406593084335327, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.883433997631073, + "num_tokens": 503078926.0, + "step": 13805 + }, + { + "epoch": 2.563788300835655, + "grad_norm": 1.6711491346359253, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.895747184753418, + "num_tokens": 503110683.0, + "step": 13806 + }, + { + "epoch": 2.56397400185701, + "grad_norm": 1.5334792137145996, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8750485181808472, + "num_tokens": 503154225.0, + "step": 13807 + }, + { + "epoch": 2.564159702878366, + "grad_norm": 1.5658282041549683, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.8940860629081726, + "num_tokens": 503189028.0, + "step": 13808 + }, + { + "epoch": 2.5643454038997215, + "grad_norm": 1.7205593585968018, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8796398043632507, + "num_tokens": 503225703.0, + "step": 13809 + }, + { + "epoch": 2.564531104921077, + "grad_norm": 1.6096144914627075, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8761327862739563, + "num_tokens": 503264728.0, + "step": 13810 + }, + { + "epoch": 2.5647168059424326, + "grad_norm": 1.4735921621322632, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8916552066802979, + "num_tokens": 503306209.0, + "step": 13811 + }, + { + "epoch": 2.5649025069637883, + "grad_norm": 1.5767525434494019, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8830211758613586, + "num_tokens": 503347478.0, + "step": 13812 + }, + { + "epoch": 2.565088207985144, + "grad_norm": 1.3882158994674683, + "learning_rate": 1e-06, + "loss": 0.2561, + "mean_token_accuracy": 0.906546950340271, + "num_tokens": 503386686.0, + "step": 13813 + }, + { + "epoch": 2.5652739090064998, + "grad_norm": 1.7511403560638428, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8911961317062378, + "num_tokens": 503421509.0, + "step": 13814 + }, + { + "epoch": 2.565459610027855, + "grad_norm": 1.5359445810317993, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8902752995491028, + "num_tokens": 503459530.0, + "step": 13815 + }, + { + "epoch": 2.565645311049211, + "grad_norm": 1.7294671535491943, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8699735403060913, + "num_tokens": 503493100.0, + "step": 13816 + }, + { + "epoch": 2.5658310120705665, + "grad_norm": 1.6763670444488525, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.8960641622543335, + "num_tokens": 503523531.0, + "step": 13817 + }, + { + "epoch": 2.566016713091922, + "grad_norm": 2.1169564723968506, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8789421319961548, + "num_tokens": 503556134.0, + "step": 13818 + }, + { + "epoch": 2.5662024141132775, + "grad_norm": 1.5951110124588013, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8840494155883789, + "num_tokens": 503592923.0, + "step": 13819 + }, + { + "epoch": 2.5663881151346333, + "grad_norm": 1.599646806716919, + "learning_rate": 1e-06, + "loss": 0.2545, + "mean_token_accuracy": 0.9068621397018433, + "num_tokens": 503626522.0, + "step": 13820 + }, + { + "epoch": 2.566573816155989, + "grad_norm": 1.6707743406295776, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8906128406524658, + "num_tokens": 503664453.0, + "step": 13821 + }, + { + "epoch": 2.5667595171773447, + "grad_norm": 1.5955711603164673, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8884752988815308, + "num_tokens": 503701790.0, + "step": 13822 + }, + { + "epoch": 2.5669452181987, + "grad_norm": 1.6151282787322998, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8800221085548401, + "num_tokens": 503740851.0, + "step": 13823 + }, + { + "epoch": 2.5671309192200558, + "grad_norm": 1.5682088136672974, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.893206000328064, + "num_tokens": 503776207.0, + "step": 13824 + }, + { + "epoch": 2.567316620241411, + "grad_norm": 1.6430164575576782, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8809828758239746, + "num_tokens": 503811504.0, + "step": 13825 + }, + { + "epoch": 2.567502321262767, + "grad_norm": 1.7342469692230225, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8809472918510437, + "num_tokens": 503846044.0, + "step": 13826 + }, + { + "epoch": 2.5676880222841225, + "grad_norm": 1.5903947353363037, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.891067624092102, + "num_tokens": 503881602.0, + "step": 13827 + }, + { + "epoch": 2.5678737233054783, + "grad_norm": 1.5940017700195312, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8848531246185303, + "num_tokens": 503921696.0, + "step": 13828 + }, + { + "epoch": 2.568059424326834, + "grad_norm": 1.4715029001235962, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8887285590171814, + "num_tokens": 503959722.0, + "step": 13829 + }, + { + "epoch": 2.5682451253481893, + "grad_norm": 1.5080597400665283, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8938194513320923, + "num_tokens": 503997385.0, + "step": 13830 + }, + { + "epoch": 2.568430826369545, + "grad_norm": 1.7072291374206543, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8798446655273438, + "num_tokens": 504035425.0, + "step": 13831 + }, + { + "epoch": 2.5686165273909007, + "grad_norm": 1.5402950048446655, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8746088743209839, + "num_tokens": 504081030.0, + "step": 13832 + }, + { + "epoch": 2.568802228412256, + "grad_norm": 1.5584577322006226, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8889200687408447, + "num_tokens": 504116488.0, + "step": 13833 + }, + { + "epoch": 2.5689879294336118, + "grad_norm": 1.6958962678909302, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8782659769058228, + "num_tokens": 504155428.0, + "step": 13834 + }, + { + "epoch": 2.5691736304549675, + "grad_norm": 1.432236671447754, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.8993494510650635, + "num_tokens": 504195733.0, + "step": 13835 + }, + { + "epoch": 2.5693593314763232, + "grad_norm": 1.4492844343185425, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8863425254821777, + "num_tokens": 504236554.0, + "step": 13836 + }, + { + "epoch": 2.569545032497679, + "grad_norm": 1.6241527795791626, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8902612924575806, + "num_tokens": 504271332.0, + "step": 13837 + }, + { + "epoch": 2.5697307335190342, + "grad_norm": 1.4861149787902832, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8957118988037109, + "num_tokens": 504310504.0, + "step": 13838 + }, + { + "epoch": 2.56991643454039, + "grad_norm": 1.621063470840454, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8931628465652466, + "num_tokens": 504345528.0, + "step": 13839 + }, + { + "epoch": 2.5701021355617457, + "grad_norm": 1.5006015300750732, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8888909220695496, + "num_tokens": 504384722.0, + "step": 13840 + }, + { + "epoch": 2.570287836583101, + "grad_norm": 1.827510952949524, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.8956387042999268, + "num_tokens": 504412826.0, + "step": 13841 + }, + { + "epoch": 2.5704735376044567, + "grad_norm": 1.6126049757003784, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.8972875475883484, + "num_tokens": 504445276.0, + "step": 13842 + }, + { + "epoch": 2.5706592386258125, + "grad_norm": 1.5881891250610352, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8957415819168091, + "num_tokens": 504486349.0, + "step": 13843 + }, + { + "epoch": 2.570844939647168, + "grad_norm": 1.6658227443695068, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8823559284210205, + "num_tokens": 504518792.0, + "step": 13844 + }, + { + "epoch": 2.571030640668524, + "grad_norm": 1.7490787506103516, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8858714699745178, + "num_tokens": 504551991.0, + "step": 13845 + }, + { + "epoch": 2.5712163416898792, + "grad_norm": 1.8068773746490479, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8811751008033752, + "num_tokens": 504583863.0, + "step": 13846 + }, + { + "epoch": 2.571402042711235, + "grad_norm": 1.628950595855713, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8823578357696533, + "num_tokens": 504620843.0, + "step": 13847 + }, + { + "epoch": 2.5715877437325907, + "grad_norm": 1.6455342769622803, + "learning_rate": 1e-06, + "loss": 0.2654, + "mean_token_accuracy": 0.9026849269866943, + "num_tokens": 504653104.0, + "step": 13848 + }, + { + "epoch": 2.571773444753946, + "grad_norm": 1.5023757219314575, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8936105966567993, + "num_tokens": 504692372.0, + "step": 13849 + }, + { + "epoch": 2.5719591457753017, + "grad_norm": 1.6152513027191162, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8918932676315308, + "num_tokens": 504727542.0, + "step": 13850 + }, + { + "epoch": 2.5721448467966574, + "grad_norm": 1.5793309211730957, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8769519329071045, + "num_tokens": 504763810.0, + "step": 13851 + }, + { + "epoch": 2.572330547818013, + "grad_norm": 1.5931569337844849, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8961220383644104, + "num_tokens": 504797367.0, + "step": 13852 + }, + { + "epoch": 2.5725162488393685, + "grad_norm": 1.632096767425537, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8873896598815918, + "num_tokens": 504835276.0, + "step": 13853 + }, + { + "epoch": 2.572701949860724, + "grad_norm": 1.599374532699585, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8873060941696167, + "num_tokens": 504873863.0, + "step": 13854 + }, + { + "epoch": 2.57288765088208, + "grad_norm": 1.6639420986175537, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8707953095436096, + "num_tokens": 504908756.0, + "step": 13855 + }, + { + "epoch": 2.5730733519034352, + "grad_norm": 1.544296145439148, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8824360370635986, + "num_tokens": 504946968.0, + "step": 13856 + }, + { + "epoch": 2.573259052924791, + "grad_norm": 1.5058367252349854, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8870083093643188, + "num_tokens": 504984173.0, + "step": 13857 + }, + { + "epoch": 2.5734447539461467, + "grad_norm": 1.5997647047042847, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8784271478652954, + "num_tokens": 505022576.0, + "step": 13858 + }, + { + "epoch": 2.5736304549675024, + "grad_norm": 1.5418764352798462, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8867435455322266, + "num_tokens": 505058667.0, + "step": 13859 + }, + { + "epoch": 2.573816155988858, + "grad_norm": 1.5157753229141235, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8887823224067688, + "num_tokens": 505098134.0, + "step": 13860 + }, + { + "epoch": 2.5740018570102134, + "grad_norm": 1.669459342956543, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8792830109596252, + "num_tokens": 505132816.0, + "step": 13861 + }, + { + "epoch": 2.574187558031569, + "grad_norm": 1.7076966762542725, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.889733076095581, + "num_tokens": 505164499.0, + "step": 13862 + }, + { + "epoch": 2.574373259052925, + "grad_norm": 1.6144076585769653, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8953044414520264, + "num_tokens": 505202566.0, + "step": 13863 + }, + { + "epoch": 2.57455896007428, + "grad_norm": 1.3788156509399414, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.9020315408706665, + "num_tokens": 505246845.0, + "step": 13864 + }, + { + "epoch": 2.574744661095636, + "grad_norm": 1.5875056982040405, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8793474435806274, + "num_tokens": 505284881.0, + "step": 13865 + }, + { + "epoch": 2.5749303621169917, + "grad_norm": 1.7477879524230957, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8761779069900513, + "num_tokens": 505317888.0, + "step": 13866 + }, + { + "epoch": 2.5751160631383474, + "grad_norm": 1.7284725904464722, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8712593913078308, + "num_tokens": 505351843.0, + "step": 13867 + }, + { + "epoch": 2.575301764159703, + "grad_norm": 1.5312743186950684, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8872609734535217, + "num_tokens": 505391029.0, + "step": 13868 + }, + { + "epoch": 2.5754874651810584, + "grad_norm": 1.5120278596878052, + "learning_rate": 1e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.8957982659339905, + "num_tokens": 505428386.0, + "step": 13869 + }, + { + "epoch": 2.575673166202414, + "grad_norm": 1.628308653831482, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8862689137458801, + "num_tokens": 505466478.0, + "step": 13870 + }, + { + "epoch": 2.57585886722377, + "grad_norm": 1.6238770484924316, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8772085905075073, + "num_tokens": 505505378.0, + "step": 13871 + }, + { + "epoch": 2.576044568245125, + "grad_norm": 1.5081627368927002, + "learning_rate": 1e-06, + "loss": 0.2583, + "mean_token_accuracy": 0.9054467678070068, + "num_tokens": 505545077.0, + "step": 13872 + }, + { + "epoch": 2.576230269266481, + "grad_norm": 1.5332177877426147, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8932915329933167, + "num_tokens": 505585840.0, + "step": 13873 + }, + { + "epoch": 2.5764159702878366, + "grad_norm": 1.7069567441940308, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8772246837615967, + "num_tokens": 505623090.0, + "step": 13874 + }, + { + "epoch": 2.5766016713091924, + "grad_norm": 1.5263426303863525, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8857119083404541, + "num_tokens": 505664063.0, + "step": 13875 + }, + { + "epoch": 2.5767873723305477, + "grad_norm": 1.6182832717895508, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.898584246635437, + "num_tokens": 505698515.0, + "step": 13876 + }, + { + "epoch": 2.5769730733519034, + "grad_norm": 1.540728211402893, + "learning_rate": 1e-06, + "loss": 0.2743, + "mean_token_accuracy": 0.9000484943389893, + "num_tokens": 505735119.0, + "step": 13877 + }, + { + "epoch": 2.577158774373259, + "grad_norm": 1.7785371541976929, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8619698882102966, + "num_tokens": 505769566.0, + "step": 13878 + }, + { + "epoch": 2.5773444753946144, + "grad_norm": 1.5273135900497437, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8878909945487976, + "num_tokens": 505806980.0, + "step": 13879 + }, + { + "epoch": 2.57753017641597, + "grad_norm": 1.5509568452835083, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8924500346183777, + "num_tokens": 505843305.0, + "step": 13880 + }, + { + "epoch": 2.577715877437326, + "grad_norm": 1.8614869117736816, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.878126859664917, + "num_tokens": 505874670.0, + "step": 13881 + }, + { + "epoch": 2.5779015784586816, + "grad_norm": 1.4727897644042969, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8851615190505981, + "num_tokens": 505915228.0, + "step": 13882 + }, + { + "epoch": 2.5780872794800374, + "grad_norm": 1.584233283996582, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.878571629524231, + "num_tokens": 505951293.0, + "step": 13883 + }, + { + "epoch": 2.5782729805013926, + "grad_norm": 1.6482614278793335, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8881809115409851, + "num_tokens": 505987844.0, + "step": 13884 + }, + { + "epoch": 2.5784586815227484, + "grad_norm": 1.7235170602798462, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8890178799629211, + "num_tokens": 506020393.0, + "step": 13885 + }, + { + "epoch": 2.578644382544104, + "grad_norm": 1.5186870098114014, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8841092586517334, + "num_tokens": 506058307.0, + "step": 13886 + }, + { + "epoch": 2.5788300835654594, + "grad_norm": 1.5244628190994263, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8883815407752991, + "num_tokens": 506095955.0, + "step": 13887 + }, + { + "epoch": 2.579015784586815, + "grad_norm": 1.7251628637313843, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8865377902984619, + "num_tokens": 506129535.0, + "step": 13888 + }, + { + "epoch": 2.579201485608171, + "grad_norm": 1.5511928796768188, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8683019876480103, + "num_tokens": 506171129.0, + "step": 13889 + }, + { + "epoch": 2.5793871866295266, + "grad_norm": 1.4851703643798828, + "learning_rate": 1e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.8953149318695068, + "num_tokens": 506209970.0, + "step": 13890 + }, + { + "epoch": 2.5795728876508823, + "grad_norm": 1.6614320278167725, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8777689933776855, + "num_tokens": 506244946.0, + "step": 13891 + }, + { + "epoch": 2.5797585886722376, + "grad_norm": 1.5672608613967896, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8852320909500122, + "num_tokens": 506284585.0, + "step": 13892 + }, + { + "epoch": 2.5799442896935934, + "grad_norm": 1.5940072536468506, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8763126134872437, + "num_tokens": 506320865.0, + "step": 13893 + }, + { + "epoch": 2.580129990714949, + "grad_norm": 1.552680492401123, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8777795433998108, + "num_tokens": 506362277.0, + "step": 13894 + }, + { + "epoch": 2.5803156917363044, + "grad_norm": 1.549168348312378, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8879891633987427, + "num_tokens": 506397578.0, + "step": 13895 + }, + { + "epoch": 2.58050139275766, + "grad_norm": 1.8286148309707642, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8724339008331299, + "num_tokens": 506428148.0, + "step": 13896 + }, + { + "epoch": 2.580687093779016, + "grad_norm": 1.5554393529891968, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8971071243286133, + "num_tokens": 506463673.0, + "step": 13897 + }, + { + "epoch": 2.5808727948003716, + "grad_norm": 1.645293116569519, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8819974660873413, + "num_tokens": 506497167.0, + "step": 13898 + }, + { + "epoch": 2.581058495821727, + "grad_norm": 1.4677879810333252, + "learning_rate": 1e-06, + "loss": 0.2815, + "mean_token_accuracy": 0.8979595899581909, + "num_tokens": 506534054.0, + "step": 13899 + }, + { + "epoch": 2.5812441968430826, + "grad_norm": 1.58842134475708, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8891948461532593, + "num_tokens": 506571812.0, + "step": 13900 + }, + { + "epoch": 2.5814298978644383, + "grad_norm": 1.497798204421997, + "learning_rate": 1e-06, + "loss": 0.2753, + "mean_token_accuracy": 0.9037978649139404, + "num_tokens": 506608571.0, + "step": 13901 + }, + { + "epoch": 2.5816155988857936, + "grad_norm": 1.634960412979126, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8749029636383057, + "num_tokens": 506650615.0, + "step": 13902 + }, + { + "epoch": 2.5818012999071493, + "grad_norm": 1.6503790616989136, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.88726407289505, + "num_tokens": 506689037.0, + "step": 13903 + }, + { + "epoch": 2.581987000928505, + "grad_norm": 1.5772247314453125, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8835105299949646, + "num_tokens": 506729464.0, + "step": 13904 + }, + { + "epoch": 2.582172701949861, + "grad_norm": 1.5829452276229858, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8860853910446167, + "num_tokens": 506768232.0, + "step": 13905 + }, + { + "epoch": 2.5823584029712165, + "grad_norm": 1.6350593566894531, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8796952366828918, + "num_tokens": 506801742.0, + "step": 13906 + }, + { + "epoch": 2.582544103992572, + "grad_norm": 1.6932170391082764, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8851709365844727, + "num_tokens": 506834055.0, + "step": 13907 + }, + { + "epoch": 2.5827298050139276, + "grad_norm": 1.6360325813293457, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8888339400291443, + "num_tokens": 506869145.0, + "step": 13908 + }, + { + "epoch": 2.5829155060352833, + "grad_norm": 1.490299940109253, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8879086971282959, + "num_tokens": 506911505.0, + "step": 13909 + }, + { + "epoch": 2.5831012070566386, + "grad_norm": 1.6705176830291748, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8836650848388672, + "num_tokens": 506947294.0, + "step": 13910 + }, + { + "epoch": 2.5832869080779943, + "grad_norm": 1.5705358982086182, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8747984766960144, + "num_tokens": 506985400.0, + "step": 13911 + }, + { + "epoch": 2.58347260909935, + "grad_norm": 1.5926578044891357, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8832777738571167, + "num_tokens": 507025905.0, + "step": 13912 + }, + { + "epoch": 2.583658310120706, + "grad_norm": 1.6771416664123535, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8820321559906006, + "num_tokens": 507062553.0, + "step": 13913 + }, + { + "epoch": 2.5838440111420615, + "grad_norm": 1.6881314516067505, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8933963775634766, + "num_tokens": 507093243.0, + "step": 13914 + }, + { + "epoch": 2.584029712163417, + "grad_norm": 1.7693355083465576, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8733921051025391, + "num_tokens": 507124981.0, + "step": 13915 + }, + { + "epoch": 2.5842154131847725, + "grad_norm": 1.5795652866363525, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.8939911127090454, + "num_tokens": 507158865.0, + "step": 13916 + }, + { + "epoch": 2.5844011142061283, + "grad_norm": 1.7594845294952393, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8790584802627563, + "num_tokens": 507195858.0, + "step": 13917 + }, + { + "epoch": 2.5845868152274836, + "grad_norm": 1.7491637468338013, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8782329559326172, + "num_tokens": 507231532.0, + "step": 13918 + }, + { + "epoch": 2.5847725162488393, + "grad_norm": 1.719435453414917, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8861954212188721, + "num_tokens": 507265197.0, + "step": 13919 + }, + { + "epoch": 2.584958217270195, + "grad_norm": 1.597104549407959, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8745808005332947, + "num_tokens": 507303070.0, + "step": 13920 + }, + { + "epoch": 2.5851439182915508, + "grad_norm": 1.5478469133377075, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8867143392562866, + "num_tokens": 507342962.0, + "step": 13921 + }, + { + "epoch": 2.585329619312906, + "grad_norm": 1.6456984281539917, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8727144598960876, + "num_tokens": 507381910.0, + "step": 13922 + }, + { + "epoch": 2.585515320334262, + "grad_norm": 1.544717788696289, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8926714658737183, + "num_tokens": 507421710.0, + "step": 13923 + }, + { + "epoch": 2.5857010213556175, + "grad_norm": 1.5178182125091553, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8959722518920898, + "num_tokens": 507457970.0, + "step": 13924 + }, + { + "epoch": 2.585886722376973, + "grad_norm": 1.5767520666122437, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.8945162296295166, + "num_tokens": 507492753.0, + "step": 13925 + }, + { + "epoch": 2.5860724233983285, + "grad_norm": 1.7649385929107666, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8900423049926758, + "num_tokens": 507524715.0, + "step": 13926 + }, + { + "epoch": 2.5862581244196843, + "grad_norm": 1.5063011646270752, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8868830800056458, + "num_tokens": 507566183.0, + "step": 13927 + }, + { + "epoch": 2.58644382544104, + "grad_norm": 1.6087769269943237, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.88763427734375, + "num_tokens": 507601924.0, + "step": 13928 + }, + { + "epoch": 2.5866295264623957, + "grad_norm": 1.6981791257858276, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8960174322128296, + "num_tokens": 507634142.0, + "step": 13929 + }, + { + "epoch": 2.586815227483751, + "grad_norm": 1.526346206665039, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8959088325500488, + "num_tokens": 507672357.0, + "step": 13930 + }, + { + "epoch": 2.5870009285051068, + "grad_norm": 1.5943454504013062, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.8969780206680298, + "num_tokens": 507706280.0, + "step": 13931 + }, + { + "epoch": 2.5871866295264625, + "grad_norm": 1.5941596031188965, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.891903281211853, + "num_tokens": 507740943.0, + "step": 13932 + }, + { + "epoch": 2.587372330547818, + "grad_norm": 1.4754996299743652, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.8966284394264221, + "num_tokens": 507780497.0, + "step": 13933 + }, + { + "epoch": 2.5875580315691735, + "grad_norm": 1.6453100442886353, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8813827633857727, + "num_tokens": 507814716.0, + "step": 13934 + }, + { + "epoch": 2.5877437325905293, + "grad_norm": 1.5356708765029907, + "learning_rate": 1e-06, + "loss": 0.2684, + "mean_token_accuracy": 0.9011324644088745, + "num_tokens": 507852222.0, + "step": 13935 + }, + { + "epoch": 2.587929433611885, + "grad_norm": 1.569599986076355, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8871627449989319, + "num_tokens": 507888997.0, + "step": 13936 + }, + { + "epoch": 2.5881151346332407, + "grad_norm": 1.5666310787200928, + "learning_rate": 1e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.8954346179962158, + "num_tokens": 507927463.0, + "step": 13937 + }, + { + "epoch": 2.588300835654596, + "grad_norm": 1.4883503913879395, + "learning_rate": 1e-06, + "loss": 0.258, + "mean_token_accuracy": 0.9063142538070679, + "num_tokens": 507965040.0, + "step": 13938 + }, + { + "epoch": 2.5884865366759517, + "grad_norm": 1.533309817314148, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8925279378890991, + "num_tokens": 508002385.0, + "step": 13939 + }, + { + "epoch": 2.5886722376973075, + "grad_norm": 1.5849788188934326, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8973674774169922, + "num_tokens": 508036834.0, + "step": 13940 + }, + { + "epoch": 2.5888579387186628, + "grad_norm": 1.6185812950134277, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8834452629089355, + "num_tokens": 508070815.0, + "step": 13941 + }, + { + "epoch": 2.5890436397400185, + "grad_norm": 1.5306274890899658, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8812170028686523, + "num_tokens": 508111521.0, + "step": 13942 + }, + { + "epoch": 2.5892293407613742, + "grad_norm": 1.6386024951934814, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8843402862548828, + "num_tokens": 508149654.0, + "step": 13943 + }, + { + "epoch": 2.58941504178273, + "grad_norm": 1.5976673364639282, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.886777400970459, + "num_tokens": 508186675.0, + "step": 13944 + }, + { + "epoch": 2.5896007428040857, + "grad_norm": 1.500854730606079, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8797449469566345, + "num_tokens": 508228598.0, + "step": 13945 + }, + { + "epoch": 2.589786443825441, + "grad_norm": 1.7456552982330322, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8811534643173218, + "num_tokens": 508261216.0, + "step": 13946 + }, + { + "epoch": 2.5899721448467967, + "grad_norm": 1.614916443824768, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8818446397781372, + "num_tokens": 508297258.0, + "step": 13947 + }, + { + "epoch": 2.590157845868152, + "grad_norm": 1.636354923248291, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8754869699478149, + "num_tokens": 508333921.0, + "step": 13948 + }, + { + "epoch": 2.5903435468895077, + "grad_norm": 1.587990403175354, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.893397331237793, + "num_tokens": 508368449.0, + "step": 13949 + }, + { + "epoch": 2.5905292479108635, + "grad_norm": 1.7765729427337646, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.878592848777771, + "num_tokens": 508399965.0, + "step": 13950 + }, + { + "epoch": 2.590714948932219, + "grad_norm": 1.6555285453796387, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8865808248519897, + "num_tokens": 508434205.0, + "step": 13951 + }, + { + "epoch": 2.590900649953575, + "grad_norm": 1.5761775970458984, + "learning_rate": 1e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.8944392204284668, + "num_tokens": 508468673.0, + "step": 13952 + }, + { + "epoch": 2.5910863509749302, + "grad_norm": 1.524061679840088, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8826706409454346, + "num_tokens": 508508131.0, + "step": 13953 + }, + { + "epoch": 2.591272051996286, + "grad_norm": 1.6193833351135254, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8863047361373901, + "num_tokens": 508544024.0, + "step": 13954 + }, + { + "epoch": 2.5914577530176417, + "grad_norm": 1.4934310913085938, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8902429342269897, + "num_tokens": 508583810.0, + "step": 13955 + }, + { + "epoch": 2.591643454038997, + "grad_norm": 1.7094043493270874, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8853272199630737, + "num_tokens": 508620129.0, + "step": 13956 + }, + { + "epoch": 2.5918291550603527, + "grad_norm": 1.4565104246139526, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8864941596984863, + "num_tokens": 508664100.0, + "step": 13957 + }, + { + "epoch": 2.5920148560817085, + "grad_norm": 1.4798799753189087, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8934053182601929, + "num_tokens": 508704110.0, + "step": 13958 + }, + { + "epoch": 2.592200557103064, + "grad_norm": 1.624741554260254, + "learning_rate": 1e-06, + "loss": 0.276, + "mean_token_accuracy": 0.9001039862632751, + "num_tokens": 508737980.0, + "step": 13959 + }, + { + "epoch": 2.59238625812442, + "grad_norm": 1.6281689405441284, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8910086154937744, + "num_tokens": 508776346.0, + "step": 13960 + }, + { + "epoch": 2.592571959145775, + "grad_norm": 1.6833903789520264, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8852680921554565, + "num_tokens": 508812842.0, + "step": 13961 + }, + { + "epoch": 2.592757660167131, + "grad_norm": 1.6642894744873047, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8803198337554932, + "num_tokens": 508848891.0, + "step": 13962 + }, + { + "epoch": 2.5929433611884867, + "grad_norm": 1.4938427209854126, + "learning_rate": 1e-06, + "loss": 0.2785, + "mean_token_accuracy": 0.8995484709739685, + "num_tokens": 508887010.0, + "step": 13963 + }, + { + "epoch": 2.593129062209842, + "grad_norm": 1.4511001110076904, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8838986754417419, + "num_tokens": 508932504.0, + "step": 13964 + }, + { + "epoch": 2.5933147632311977, + "grad_norm": 1.6018296480178833, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8911816477775574, + "num_tokens": 508969307.0, + "step": 13965 + }, + { + "epoch": 2.5935004642525534, + "grad_norm": 1.5627063512802124, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8918909430503845, + "num_tokens": 509008995.0, + "step": 13966 + }, + { + "epoch": 2.593686165273909, + "grad_norm": 1.5202666521072388, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.85976243019104, + "num_tokens": 509054995.0, + "step": 13967 + }, + { + "epoch": 2.593871866295265, + "grad_norm": 1.6130547523498535, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.885672390460968, + "num_tokens": 509090569.0, + "step": 13968 + }, + { + "epoch": 2.59405756731662, + "grad_norm": 1.6202831268310547, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8921360373497009, + "num_tokens": 509124524.0, + "step": 13969 + }, + { + "epoch": 2.594243268337976, + "grad_norm": 1.5344880819320679, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8800454139709473, + "num_tokens": 509165254.0, + "step": 13970 + }, + { + "epoch": 2.594428969359331, + "grad_norm": 1.8849799633026123, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8737872838973999, + "num_tokens": 509195220.0, + "step": 13971 + }, + { + "epoch": 2.594614670380687, + "grad_norm": 1.7145413160324097, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8848731517791748, + "num_tokens": 509227592.0, + "step": 13972 + }, + { + "epoch": 2.5948003714020427, + "grad_norm": 1.7006750106811523, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8759667873382568, + "num_tokens": 509261324.0, + "step": 13973 + }, + { + "epoch": 2.5949860724233984, + "grad_norm": 1.7739897966384888, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8790305256843567, + "num_tokens": 509294559.0, + "step": 13974 + }, + { + "epoch": 2.595171773444754, + "grad_norm": 1.8471664190292358, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8881347179412842, + "num_tokens": 509323814.0, + "step": 13975 + }, + { + "epoch": 2.5953574744661094, + "grad_norm": 1.8144683837890625, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8789459466934204, + "num_tokens": 509356833.0, + "step": 13976 + }, + { + "epoch": 2.595543175487465, + "grad_norm": 1.714975357055664, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8812406659126282, + "num_tokens": 509389749.0, + "step": 13977 + }, + { + "epoch": 2.595728876508821, + "grad_norm": 1.5523055791854858, + "learning_rate": 1e-06, + "loss": 0.2883, + "mean_token_accuracy": 0.8954045176506042, + "num_tokens": 509425617.0, + "step": 13978 + }, + { + "epoch": 2.595914577530176, + "grad_norm": 1.6097431182861328, + "learning_rate": 1e-06, + "loss": 0.2776, + "mean_token_accuracy": 0.9011912941932678, + "num_tokens": 509463719.0, + "step": 13979 + }, + { + "epoch": 2.596100278551532, + "grad_norm": 1.867614984512329, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8753167986869812, + "num_tokens": 509496321.0, + "step": 13980 + }, + { + "epoch": 2.5962859795728876, + "grad_norm": 1.657336711883545, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8910355567932129, + "num_tokens": 509531167.0, + "step": 13981 + }, + { + "epoch": 2.5964716805942434, + "grad_norm": 1.4530442953109741, + "learning_rate": 1e-06, + "loss": 0.2783, + "mean_token_accuracy": 0.8999204635620117, + "num_tokens": 509570273.0, + "step": 13982 + }, + { + "epoch": 2.596657381615599, + "grad_norm": 1.5991828441619873, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.88677978515625, + "num_tokens": 509604846.0, + "step": 13983 + }, + { + "epoch": 2.5968430826369544, + "grad_norm": 1.5894038677215576, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8812544941902161, + "num_tokens": 509645068.0, + "step": 13984 + }, + { + "epoch": 2.59702878365831, + "grad_norm": 1.6079269647598267, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8904532194137573, + "num_tokens": 509677982.0, + "step": 13985 + }, + { + "epoch": 2.597214484679666, + "grad_norm": 1.641750454902649, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8792014122009277, + "num_tokens": 509714473.0, + "step": 13986 + }, + { + "epoch": 2.597400185701021, + "grad_norm": 1.600902795791626, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.88620924949646, + "num_tokens": 509753684.0, + "step": 13987 + }, + { + "epoch": 2.597585886722377, + "grad_norm": 1.558030128479004, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8918291926383972, + "num_tokens": 509788617.0, + "step": 13988 + }, + { + "epoch": 2.5977715877437326, + "grad_norm": 1.6105910539627075, + "learning_rate": 1e-06, + "loss": 0.2553, + "mean_token_accuracy": 0.9059568047523499, + "num_tokens": 509820106.0, + "step": 13989 + }, + { + "epoch": 2.5979572887650884, + "grad_norm": 1.6592340469360352, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.876481831073761, + "num_tokens": 509855748.0, + "step": 13990 + }, + { + "epoch": 2.598142989786444, + "grad_norm": 1.5243562459945679, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8925431966781616, + "num_tokens": 509891602.0, + "step": 13991 + }, + { + "epoch": 2.5983286908077994, + "grad_norm": 1.614170789718628, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.881963849067688, + "num_tokens": 509929163.0, + "step": 13992 + }, + { + "epoch": 2.598514391829155, + "grad_norm": 1.5481315851211548, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8961222767829895, + "num_tokens": 509965223.0, + "step": 13993 + }, + { + "epoch": 2.5987000928505104, + "grad_norm": 1.5539119243621826, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8956965208053589, + "num_tokens": 509999090.0, + "step": 13994 + }, + { + "epoch": 2.598885793871866, + "grad_norm": 1.6352308988571167, + "learning_rate": 1e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.897062361240387, + "num_tokens": 510029994.0, + "step": 13995 + }, + { + "epoch": 2.599071494893222, + "grad_norm": 1.540520429611206, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8947859406471252, + "num_tokens": 510063133.0, + "step": 13996 + }, + { + "epoch": 2.5992571959145776, + "grad_norm": 1.5170016288757324, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8825654983520508, + "num_tokens": 510101313.0, + "step": 13997 + }, + { + "epoch": 2.5994428969359333, + "grad_norm": 1.5507426261901855, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8935714960098267, + "num_tokens": 510140315.0, + "step": 13998 + }, + { + "epoch": 2.5996285979572886, + "grad_norm": 1.6945409774780273, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8944738507270813, + "num_tokens": 510176218.0, + "step": 13999 + }, + { + "epoch": 2.5998142989786444, + "grad_norm": 1.5147669315338135, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.896329939365387, + "num_tokens": 510213119.0, + "step": 14000 + }, + { + "epoch": 2.6, + "grad_norm": 1.6516677141189575, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8919105529785156, + "num_tokens": 510246898.0, + "step": 14001 + }, + { + "epoch": 2.6001857010213554, + "grad_norm": 1.5256098508834839, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8902809619903564, + "num_tokens": 510286436.0, + "step": 14002 + }, + { + "epoch": 2.600371402042711, + "grad_norm": 1.7921713590621948, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8914614319801331, + "num_tokens": 510318182.0, + "step": 14003 + }, + { + "epoch": 2.600557103064067, + "grad_norm": 1.730065941810608, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8885433673858643, + "num_tokens": 510349129.0, + "step": 14004 + }, + { + "epoch": 2.6007428040854226, + "grad_norm": 1.5307992696762085, + "learning_rate": 1e-06, + "loss": 0.2749, + "mean_token_accuracy": 0.8983190059661865, + "num_tokens": 510383774.0, + "step": 14005 + }, + { + "epoch": 2.6009285051067783, + "grad_norm": 1.5889763832092285, + "learning_rate": 1e-06, + "loss": 0.2703, + "mean_token_accuracy": 0.9000111818313599, + "num_tokens": 510418598.0, + "step": 14006 + }, + { + "epoch": 2.6011142061281336, + "grad_norm": 1.8637259006500244, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8912737369537354, + "num_tokens": 510448208.0, + "step": 14007 + }, + { + "epoch": 2.6012999071494893, + "grad_norm": 1.5760512351989746, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8855570554733276, + "num_tokens": 510484594.0, + "step": 14008 + }, + { + "epoch": 2.601485608170845, + "grad_norm": 1.8193410634994507, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8680424690246582, + "num_tokens": 510517404.0, + "step": 14009 + }, + { + "epoch": 2.6016713091922004, + "grad_norm": 1.6599477529525757, + "learning_rate": 1e-06, + "loss": 0.2489, + "mean_token_accuracy": 0.9068350195884705, + "num_tokens": 510545112.0, + "step": 14010 + }, + { + "epoch": 2.601857010213556, + "grad_norm": 1.6344891786575317, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8864881992340088, + "num_tokens": 510581385.0, + "step": 14011 + }, + { + "epoch": 2.602042711234912, + "grad_norm": 1.6052417755126953, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8834643363952637, + "num_tokens": 510617175.0, + "step": 14012 + }, + { + "epoch": 2.6022284122562676, + "grad_norm": 1.836441993713379, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8838478326797485, + "num_tokens": 510646349.0, + "step": 14013 + }, + { + "epoch": 2.6024141132776233, + "grad_norm": 1.7140045166015625, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.887920618057251, + "num_tokens": 510678182.0, + "step": 14014 + }, + { + "epoch": 2.6025998142989786, + "grad_norm": 1.626818060874939, + "learning_rate": 1e-06, + "loss": 0.2735, + "mean_token_accuracy": 0.8992636799812317, + "num_tokens": 510711039.0, + "step": 14015 + }, + { + "epoch": 2.6027855153203343, + "grad_norm": 1.6015360355377197, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.893459141254425, + "num_tokens": 510747548.0, + "step": 14016 + }, + { + "epoch": 2.60297121634169, + "grad_norm": 1.588469386100769, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8912951350212097, + "num_tokens": 510781566.0, + "step": 14017 + }, + { + "epoch": 2.6031569173630453, + "grad_norm": 1.5844306945800781, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8901505470275879, + "num_tokens": 510822371.0, + "step": 14018 + }, + { + "epoch": 2.603342618384401, + "grad_norm": 1.6008727550506592, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8935285806655884, + "num_tokens": 510858338.0, + "step": 14019 + }, + { + "epoch": 2.603528319405757, + "grad_norm": 1.7352638244628906, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8920938968658447, + "num_tokens": 510890385.0, + "step": 14020 + }, + { + "epoch": 2.6037140204271125, + "grad_norm": 1.615576148033142, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8893353939056396, + "num_tokens": 510928625.0, + "step": 14021 + }, + { + "epoch": 2.603899721448468, + "grad_norm": 1.635015606880188, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8803406953811646, + "num_tokens": 510964354.0, + "step": 14022 + }, + { + "epoch": 2.6040854224698236, + "grad_norm": 1.416514277458191, + "learning_rate": 1e-06, + "loss": 0.2715, + "mean_token_accuracy": 0.9022092223167419, + "num_tokens": 511003645.0, + "step": 14023 + }, + { + "epoch": 2.6042711234911793, + "grad_norm": 1.5700435638427734, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8705264925956726, + "num_tokens": 511043816.0, + "step": 14024 + }, + { + "epoch": 2.6044568245125346, + "grad_norm": 1.5806657075881958, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8822287917137146, + "num_tokens": 511085046.0, + "step": 14025 + }, + { + "epoch": 2.6046425255338903, + "grad_norm": 1.7973136901855469, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8964394927024841, + "num_tokens": 511112213.0, + "step": 14026 + }, + { + "epoch": 2.604828226555246, + "grad_norm": 1.5110398530960083, + "learning_rate": 1e-06, + "loss": 0.2632, + "mean_token_accuracy": 0.9053753614425659, + "num_tokens": 511151386.0, + "step": 14027 + }, + { + "epoch": 2.6050139275766018, + "grad_norm": 1.7423920631408691, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8752042055130005, + "num_tokens": 511183929.0, + "step": 14028 + }, + { + "epoch": 2.6051996285979575, + "grad_norm": 1.5922008752822876, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8840171694755554, + "num_tokens": 511219652.0, + "step": 14029 + }, + { + "epoch": 2.605385329619313, + "grad_norm": 1.4845117330551147, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8898241519927979, + "num_tokens": 511262751.0, + "step": 14030 + }, + { + "epoch": 2.6055710306406685, + "grad_norm": 1.674910068511963, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8839290142059326, + "num_tokens": 511300212.0, + "step": 14031 + }, + { + "epoch": 2.6057567316620243, + "grad_norm": 1.7385845184326172, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8783838748931885, + "num_tokens": 511335042.0, + "step": 14032 + }, + { + "epoch": 2.6059424326833796, + "grad_norm": 1.5171359777450562, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8862975835800171, + "num_tokens": 511375462.0, + "step": 14033 + }, + { + "epoch": 2.6061281337047353, + "grad_norm": 1.4983320236206055, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8885529637336731, + "num_tokens": 511415422.0, + "step": 14034 + }, + { + "epoch": 2.606313834726091, + "grad_norm": 1.5258715152740479, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.9008238315582275, + "num_tokens": 511453826.0, + "step": 14035 + }, + { + "epoch": 2.6064995357474467, + "grad_norm": 1.6135255098342896, + "learning_rate": 1e-06, + "loss": 0.282, + "mean_token_accuracy": 0.8973774909973145, + "num_tokens": 511488824.0, + "step": 14036 + }, + { + "epoch": 2.6066852367688025, + "grad_norm": 1.5532031059265137, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8840411901473999, + "num_tokens": 511527504.0, + "step": 14037 + }, + { + "epoch": 2.6068709377901578, + "grad_norm": 1.6544770002365112, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8850513696670532, + "num_tokens": 511562892.0, + "step": 14038 + }, + { + "epoch": 2.6070566388115135, + "grad_norm": 1.699786901473999, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8900985717773438, + "num_tokens": 511594602.0, + "step": 14039 + }, + { + "epoch": 2.6072423398328692, + "grad_norm": 1.586888074874878, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8927431106567383, + "num_tokens": 511631868.0, + "step": 14040 + }, + { + "epoch": 2.6074280408542245, + "grad_norm": 1.604241132736206, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8844600915908813, + "num_tokens": 511668676.0, + "step": 14041 + }, + { + "epoch": 2.6076137418755803, + "grad_norm": 1.6765174865722656, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8794796466827393, + "num_tokens": 511702768.0, + "step": 14042 + }, + { + "epoch": 2.607799442896936, + "grad_norm": 1.517751693725586, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8812243938446045, + "num_tokens": 511744594.0, + "step": 14043 + }, + { + "epoch": 2.6079851439182917, + "grad_norm": 1.448435664176941, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.8947901725769043, + "num_tokens": 511786203.0, + "step": 14044 + }, + { + "epoch": 2.608170844939647, + "grad_norm": 1.6199229955673218, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8731149435043335, + "num_tokens": 511824306.0, + "step": 14045 + }, + { + "epoch": 2.6083565459610027, + "grad_norm": 1.510388731956482, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8904441595077515, + "num_tokens": 511866809.0, + "step": 14046 + }, + { + "epoch": 2.6085422469823585, + "grad_norm": 1.6732381582260132, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8821447491645813, + "num_tokens": 511906361.0, + "step": 14047 + }, + { + "epoch": 2.6087279480037138, + "grad_norm": 1.6137737035751343, + "learning_rate": 1e-06, + "loss": 0.2727, + "mean_token_accuracy": 0.902489960193634, + "num_tokens": 511940735.0, + "step": 14048 + }, + { + "epoch": 2.6089136490250695, + "grad_norm": 1.591789722442627, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8888980150222778, + "num_tokens": 511980069.0, + "step": 14049 + }, + { + "epoch": 2.6090993500464252, + "grad_norm": 1.7158616781234741, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8679656982421875, + "num_tokens": 512013968.0, + "step": 14050 + }, + { + "epoch": 2.609285051067781, + "grad_norm": 1.6072094440460205, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8874434232711792, + "num_tokens": 512051531.0, + "step": 14051 + }, + { + "epoch": 2.6094707520891367, + "grad_norm": 1.578920602798462, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8909124135971069, + "num_tokens": 512090196.0, + "step": 14052 + }, + { + "epoch": 2.609656453110492, + "grad_norm": 1.6258277893066406, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8763917088508606, + "num_tokens": 512130549.0, + "step": 14053 + }, + { + "epoch": 2.6098421541318477, + "grad_norm": 1.6744928359985352, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8931689262390137, + "num_tokens": 512164687.0, + "step": 14054 + }, + { + "epoch": 2.6100278551532035, + "grad_norm": 1.585344910621643, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8955938816070557, + "num_tokens": 512199764.0, + "step": 14055 + }, + { + "epoch": 2.6102135561745587, + "grad_norm": 1.7463549375534058, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8753429651260376, + "num_tokens": 512236036.0, + "step": 14056 + }, + { + "epoch": 2.6103992571959145, + "grad_norm": 1.7180449962615967, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8897936940193176, + "num_tokens": 512270265.0, + "step": 14057 + }, + { + "epoch": 2.61058495821727, + "grad_norm": 1.4441741704940796, + "learning_rate": 1e-06, + "loss": 0.2798, + "mean_token_accuracy": 0.8993852734565735, + "num_tokens": 512306632.0, + "step": 14058 + }, + { + "epoch": 2.610770659238626, + "grad_norm": 1.60365891456604, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8787864446640015, + "num_tokens": 512342347.0, + "step": 14059 + }, + { + "epoch": 2.6109563602599817, + "grad_norm": 1.6129064559936523, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8831907510757446, + "num_tokens": 512377892.0, + "step": 14060 + }, + { + "epoch": 2.611142061281337, + "grad_norm": 1.6306819915771484, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.883148729801178, + "num_tokens": 512412605.0, + "step": 14061 + }, + { + "epoch": 2.6113277623026927, + "grad_norm": 1.5303493738174438, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8934382200241089, + "num_tokens": 512450226.0, + "step": 14062 + }, + { + "epoch": 2.6115134633240484, + "grad_norm": 1.6324743032455444, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8875807523727417, + "num_tokens": 512486491.0, + "step": 14063 + }, + { + "epoch": 2.6116991643454037, + "grad_norm": 1.561333179473877, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8804686069488525, + "num_tokens": 512526251.0, + "step": 14064 + }, + { + "epoch": 2.6118848653667595, + "grad_norm": 1.7444106340408325, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8803614974021912, + "num_tokens": 512561370.0, + "step": 14065 + }, + { + "epoch": 2.612070566388115, + "grad_norm": 1.53953218460083, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8860080242156982, + "num_tokens": 512598905.0, + "step": 14066 + }, + { + "epoch": 2.612256267409471, + "grad_norm": 1.7463743686676025, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8813706636428833, + "num_tokens": 512635094.0, + "step": 14067 + }, + { + "epoch": 2.612441968430826, + "grad_norm": 1.525762677192688, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8819090127944946, + "num_tokens": 512676617.0, + "step": 14068 + }, + { + "epoch": 2.612627669452182, + "grad_norm": 1.5428508520126343, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8924365043640137, + "num_tokens": 512715090.0, + "step": 14069 + }, + { + "epoch": 2.6128133704735377, + "grad_norm": 1.7085773944854736, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8897631764411926, + "num_tokens": 512747488.0, + "step": 14070 + }, + { + "epoch": 2.612999071494893, + "grad_norm": 1.6880069971084595, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.876563310623169, + "num_tokens": 512781934.0, + "step": 14071 + }, + { + "epoch": 2.6131847725162487, + "grad_norm": 1.5303871631622314, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8922927379608154, + "num_tokens": 512821107.0, + "step": 14072 + }, + { + "epoch": 2.6133704735376044, + "grad_norm": 1.628879189491272, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8868826627731323, + "num_tokens": 512856739.0, + "step": 14073 + }, + { + "epoch": 2.61355617455896, + "grad_norm": 1.70222008228302, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8854349851608276, + "num_tokens": 512893052.0, + "step": 14074 + }, + { + "epoch": 2.613741875580316, + "grad_norm": 1.6659111976623535, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8914258480072021, + "num_tokens": 512927557.0, + "step": 14075 + }, + { + "epoch": 2.613927576601671, + "grad_norm": 1.5070042610168457, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8841733932495117, + "num_tokens": 512969565.0, + "step": 14076 + }, + { + "epoch": 2.614113277623027, + "grad_norm": 1.7357455492019653, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8907674551010132, + "num_tokens": 513005842.0, + "step": 14077 + }, + { + "epoch": 2.6142989786443827, + "grad_norm": 1.6035574674606323, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8774038553237915, + "num_tokens": 513042394.0, + "step": 14078 + }, + { + "epoch": 2.614484679665738, + "grad_norm": 1.7090147733688354, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8811236619949341, + "num_tokens": 513077742.0, + "step": 14079 + }, + { + "epoch": 2.6146703806870937, + "grad_norm": 1.502098798751831, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8843077421188354, + "num_tokens": 513116332.0, + "step": 14080 + }, + { + "epoch": 2.6148560817084494, + "grad_norm": 1.7400928735733032, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8850740194320679, + "num_tokens": 513147323.0, + "step": 14081 + }, + { + "epoch": 2.615041782729805, + "grad_norm": 1.5755287408828735, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8647034168243408, + "num_tokens": 513187332.0, + "step": 14082 + }, + { + "epoch": 2.615227483751161, + "grad_norm": 1.6645946502685547, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8916803598403931, + "num_tokens": 513218921.0, + "step": 14083 + }, + { + "epoch": 2.615413184772516, + "grad_norm": 1.5718656778335571, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8842706680297852, + "num_tokens": 513257322.0, + "step": 14084 + }, + { + "epoch": 2.615598885793872, + "grad_norm": 1.676493525505066, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8826983571052551, + "num_tokens": 513292533.0, + "step": 14085 + }, + { + "epoch": 2.6157845868152276, + "grad_norm": 1.6280375719070435, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8811609745025635, + "num_tokens": 513328712.0, + "step": 14086 + }, + { + "epoch": 2.615970287836583, + "grad_norm": 1.5279265642166138, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8775455951690674, + "num_tokens": 513368422.0, + "step": 14087 + }, + { + "epoch": 2.6161559888579387, + "grad_norm": 1.5262696743011475, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8860503435134888, + "num_tokens": 513405380.0, + "step": 14088 + }, + { + "epoch": 2.6163416898792944, + "grad_norm": 1.8048999309539795, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8707119226455688, + "num_tokens": 513436446.0, + "step": 14089 + }, + { + "epoch": 2.61652739090065, + "grad_norm": 1.6846367120742798, + "learning_rate": 1e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.8983272910118103, + "num_tokens": 513468424.0, + "step": 14090 + }, + { + "epoch": 2.6167130919220054, + "grad_norm": 1.5787981748580933, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8880202174186707, + "num_tokens": 513506693.0, + "step": 14091 + }, + { + "epoch": 2.616898792943361, + "grad_norm": 1.6715552806854248, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8767699599266052, + "num_tokens": 513538645.0, + "step": 14092 + }, + { + "epoch": 2.617084493964717, + "grad_norm": 1.5916454792022705, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8850014209747314, + "num_tokens": 513575933.0, + "step": 14093 + }, + { + "epoch": 2.617270194986072, + "grad_norm": 1.8516281843185425, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8787996172904968, + "num_tokens": 513605796.0, + "step": 14094 + }, + { + "epoch": 2.617455896007428, + "grad_norm": 1.5016818046569824, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8856920599937439, + "num_tokens": 513643630.0, + "step": 14095 + }, + { + "epoch": 2.6176415970287836, + "grad_norm": 1.7188587188720703, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8781303763389587, + "num_tokens": 513677954.0, + "step": 14096 + }, + { + "epoch": 2.6178272980501394, + "grad_norm": 1.5866706371307373, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8813127279281616, + "num_tokens": 513716549.0, + "step": 14097 + }, + { + "epoch": 2.618012999071495, + "grad_norm": 1.5389814376831055, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8972983360290527, + "num_tokens": 513752405.0, + "step": 14098 + }, + { + "epoch": 2.6181987000928504, + "grad_norm": 1.7618300914764404, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.882201075553894, + "num_tokens": 513785436.0, + "step": 14099 + }, + { + "epoch": 2.618384401114206, + "grad_norm": 1.518370270729065, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8903887271881104, + "num_tokens": 513825174.0, + "step": 14100 + }, + { + "epoch": 2.618570102135562, + "grad_norm": 1.5186973810195923, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8910481333732605, + "num_tokens": 513863335.0, + "step": 14101 + }, + { + "epoch": 2.618755803156917, + "grad_norm": 1.6176375150680542, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.882317304611206, + "num_tokens": 513901260.0, + "step": 14102 + }, + { + "epoch": 2.618941504178273, + "grad_norm": 1.6030309200286865, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8964170217514038, + "num_tokens": 513934210.0, + "step": 14103 + }, + { + "epoch": 2.6191272051996286, + "grad_norm": 1.5971672534942627, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.885442852973938, + "num_tokens": 513971592.0, + "step": 14104 + }, + { + "epoch": 2.6193129062209843, + "grad_norm": 1.6620912551879883, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.882929801940918, + "num_tokens": 514005815.0, + "step": 14105 + }, + { + "epoch": 2.61949860724234, + "grad_norm": 1.5410244464874268, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8909294605255127, + "num_tokens": 514045316.0, + "step": 14106 + }, + { + "epoch": 2.6196843082636954, + "grad_norm": 1.4386322498321533, + "learning_rate": 1e-06, + "loss": 0.2683, + "mean_token_accuracy": 0.9021607637405396, + "num_tokens": 514082957.0, + "step": 14107 + }, + { + "epoch": 2.619870009285051, + "grad_norm": 1.6023650169372559, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8768134117126465, + "num_tokens": 514121264.0, + "step": 14108 + }, + { + "epoch": 2.620055710306407, + "grad_norm": 1.6770237684249878, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8689046502113342, + "num_tokens": 514157878.0, + "step": 14109 + }, + { + "epoch": 2.620241411327762, + "grad_norm": 1.684963345527649, + "learning_rate": 1e-06, + "loss": 0.2731, + "mean_token_accuracy": 0.9000347256660461, + "num_tokens": 514187887.0, + "step": 14110 + }, + { + "epoch": 2.620427112349118, + "grad_norm": 1.542053461074829, + "learning_rate": 1e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.8973891735076904, + "num_tokens": 514229251.0, + "step": 14111 + }, + { + "epoch": 2.6206128133704736, + "grad_norm": 1.6836292743682861, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8871454000473022, + "num_tokens": 514262115.0, + "step": 14112 + }, + { + "epoch": 2.6207985143918293, + "grad_norm": 1.525856614112854, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8899771571159363, + "num_tokens": 514300836.0, + "step": 14113 + }, + { + "epoch": 2.620984215413185, + "grad_norm": 1.5326299667358398, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8858135938644409, + "num_tokens": 514338938.0, + "step": 14114 + }, + { + "epoch": 2.6211699164345403, + "grad_norm": 1.5906977653503418, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8849289417266846, + "num_tokens": 514377029.0, + "step": 14115 + }, + { + "epoch": 2.621355617455896, + "grad_norm": 1.5757319927215576, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8863703012466431, + "num_tokens": 514415920.0, + "step": 14116 + }, + { + "epoch": 2.6215413184772514, + "grad_norm": 1.4716784954071045, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8849432468414307, + "num_tokens": 514458993.0, + "step": 14117 + }, + { + "epoch": 2.621727019498607, + "grad_norm": 1.5211126804351807, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8914934992790222, + "num_tokens": 514493936.0, + "step": 14118 + }, + { + "epoch": 2.621912720519963, + "grad_norm": 1.5621999502182007, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8824272751808167, + "num_tokens": 514534663.0, + "step": 14119 + }, + { + "epoch": 2.6220984215413186, + "grad_norm": 1.5092123746871948, + "learning_rate": 1e-06, + "loss": 0.2828, + "mean_token_accuracy": 0.8981139659881592, + "num_tokens": 514572183.0, + "step": 14120 + }, + { + "epoch": 2.6222841225626743, + "grad_norm": 1.5343313217163086, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8895353078842163, + "num_tokens": 514609633.0, + "step": 14121 + }, + { + "epoch": 2.6224698235840296, + "grad_norm": 1.6550885438919067, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8899394273757935, + "num_tokens": 514644929.0, + "step": 14122 + }, + { + "epoch": 2.6226555246053853, + "grad_norm": 1.5525248050689697, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8760315179824829, + "num_tokens": 514686089.0, + "step": 14123 + }, + { + "epoch": 2.622841225626741, + "grad_norm": 1.5362045764923096, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8898633122444153, + "num_tokens": 514722469.0, + "step": 14124 + }, + { + "epoch": 2.6230269266480963, + "grad_norm": 1.5246851444244385, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8948301076889038, + "num_tokens": 514761368.0, + "step": 14125 + }, + { + "epoch": 2.623212627669452, + "grad_norm": 1.5185571908950806, + "learning_rate": 1e-06, + "loss": 0.2756, + "mean_token_accuracy": 0.9002193212509155, + "num_tokens": 514799824.0, + "step": 14126 + }, + { + "epoch": 2.623398328690808, + "grad_norm": 1.5974736213684082, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8853794932365417, + "num_tokens": 514835559.0, + "step": 14127 + }, + { + "epoch": 2.6235840297121635, + "grad_norm": 1.585540533065796, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8761878609657288, + "num_tokens": 514876215.0, + "step": 14128 + }, + { + "epoch": 2.6237697307335193, + "grad_norm": 1.65994131565094, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8855693936347961, + "num_tokens": 514908287.0, + "step": 14129 + }, + { + "epoch": 2.6239554317548746, + "grad_norm": 1.611338496208191, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8888285160064697, + "num_tokens": 514943346.0, + "step": 14130 + }, + { + "epoch": 2.6241411327762303, + "grad_norm": 1.6271076202392578, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8832587599754333, + "num_tokens": 514980652.0, + "step": 14131 + }, + { + "epoch": 2.624326833797586, + "grad_norm": 1.6689642667770386, + "learning_rate": 1e-06, + "loss": 0.2746, + "mean_token_accuracy": 0.8970530033111572, + "num_tokens": 515011557.0, + "step": 14132 + }, + { + "epoch": 2.6245125348189413, + "grad_norm": 1.5354838371276855, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.900458574295044, + "num_tokens": 515048384.0, + "step": 14133 + }, + { + "epoch": 2.624698235840297, + "grad_norm": 1.4980833530426025, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8836922645568848, + "num_tokens": 515091641.0, + "step": 14134 + }, + { + "epoch": 2.6248839368616528, + "grad_norm": 1.668487787246704, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.883384108543396, + "num_tokens": 515124513.0, + "step": 14135 + }, + { + "epoch": 2.6250696378830085, + "grad_norm": 1.711922526359558, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8810197710990906, + "num_tokens": 515157467.0, + "step": 14136 + }, + { + "epoch": 2.6252553389043642, + "grad_norm": 1.6621628999710083, + "learning_rate": 1e-06, + "loss": 0.2698, + "mean_token_accuracy": 0.9022936224937439, + "num_tokens": 515191330.0, + "step": 14137 + }, + { + "epoch": 2.6254410399257195, + "grad_norm": 1.4412578344345093, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.886876106262207, + "num_tokens": 515234818.0, + "step": 14138 + }, + { + "epoch": 2.6256267409470753, + "grad_norm": 1.7883738279342651, + "learning_rate": 1e-06, + "loss": 0.2789, + "mean_token_accuracy": 0.8982082605361938, + "num_tokens": 515264173.0, + "step": 14139 + }, + { + "epoch": 2.6258124419684306, + "grad_norm": 1.6727758646011353, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8860803842544556, + "num_tokens": 515302992.0, + "step": 14140 + }, + { + "epoch": 2.6259981429897863, + "grad_norm": 1.5001537799835205, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8838233947753906, + "num_tokens": 515341850.0, + "step": 14141 + }, + { + "epoch": 2.626183844011142, + "grad_norm": 1.5624586343765259, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8849674463272095, + "num_tokens": 515382058.0, + "step": 14142 + }, + { + "epoch": 2.6263695450324978, + "grad_norm": 1.6465903520584106, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8864133954048157, + "num_tokens": 515416491.0, + "step": 14143 + }, + { + "epoch": 2.6265552460538535, + "grad_norm": 1.5827125310897827, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.8941582441329956, + "num_tokens": 515449896.0, + "step": 14144 + }, + { + "epoch": 2.6267409470752088, + "grad_norm": 1.5492503643035889, + "learning_rate": 1e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.8928121328353882, + "num_tokens": 515487573.0, + "step": 14145 + }, + { + "epoch": 2.6269266480965645, + "grad_norm": 1.638051986694336, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8884153366088867, + "num_tokens": 515521420.0, + "step": 14146 + }, + { + "epoch": 2.6271123491179202, + "grad_norm": 1.6427583694458008, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8898031711578369, + "num_tokens": 515554287.0, + "step": 14147 + }, + { + "epoch": 2.6272980501392755, + "grad_norm": 1.571718454360962, + "learning_rate": 1e-06, + "loss": 0.2724, + "mean_token_accuracy": 0.8992747068405151, + "num_tokens": 515587456.0, + "step": 14148 + }, + { + "epoch": 2.6274837511606313, + "grad_norm": 1.5937693119049072, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.875749945640564, + "num_tokens": 515629357.0, + "step": 14149 + }, + { + "epoch": 2.627669452181987, + "grad_norm": 1.6208994388580322, + "learning_rate": 1e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.8930009603500366, + "num_tokens": 515667736.0, + "step": 14150 + }, + { + "epoch": 2.6278551532033427, + "grad_norm": 1.553193211555481, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8784166574478149, + "num_tokens": 515706146.0, + "step": 14151 + }, + { + "epoch": 2.6280408542246985, + "grad_norm": 1.5859004259109497, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8842119574546814, + "num_tokens": 515747423.0, + "step": 14152 + }, + { + "epoch": 2.6282265552460538, + "grad_norm": 1.632071614265442, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8929556608200073, + "num_tokens": 515782056.0, + "step": 14153 + }, + { + "epoch": 2.6284122562674095, + "grad_norm": 1.7575641870498657, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8750391006469727, + "num_tokens": 515815674.0, + "step": 14154 + }, + { + "epoch": 2.628597957288765, + "grad_norm": 1.580842137336731, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8878506422042847, + "num_tokens": 515856980.0, + "step": 14155 + }, + { + "epoch": 2.6287836583101205, + "grad_norm": 1.5400208234786987, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8898038864135742, + "num_tokens": 515898846.0, + "step": 14156 + }, + { + "epoch": 2.6289693593314762, + "grad_norm": 1.624714970588684, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8862572908401489, + "num_tokens": 515935795.0, + "step": 14157 + }, + { + "epoch": 2.629155060352832, + "grad_norm": 1.7229113578796387, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8787117600440979, + "num_tokens": 515971758.0, + "step": 14158 + }, + { + "epoch": 2.6293407613741877, + "grad_norm": 1.6816669702529907, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8792811036109924, + "num_tokens": 516006911.0, + "step": 14159 + }, + { + "epoch": 2.6295264623955434, + "grad_norm": 1.655043601989746, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8863608241081238, + "num_tokens": 516043519.0, + "step": 14160 + }, + { + "epoch": 2.6297121634168987, + "grad_norm": 1.5654364824295044, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8834930658340454, + "num_tokens": 516086291.0, + "step": 14161 + }, + { + "epoch": 2.6298978644382545, + "grad_norm": 1.874422311782837, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8663512468338013, + "num_tokens": 516123518.0, + "step": 14162 + }, + { + "epoch": 2.6300835654596098, + "grad_norm": 1.7265745401382446, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.89268958568573, + "num_tokens": 516160642.0, + "step": 14163 + }, + { + "epoch": 2.6302692664809655, + "grad_norm": 1.528180480003357, + "learning_rate": 1e-06, + "loss": 0.2602, + "mean_token_accuracy": 0.904358983039856, + "num_tokens": 516194552.0, + "step": 14164 + }, + { + "epoch": 2.630454967502321, + "grad_norm": 1.7908636331558228, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8953648805618286, + "num_tokens": 516227746.0, + "step": 14165 + }, + { + "epoch": 2.630640668523677, + "grad_norm": 1.7084628343582153, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8928478360176086, + "num_tokens": 516259559.0, + "step": 14166 + }, + { + "epoch": 2.6308263695450327, + "grad_norm": 1.5555024147033691, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8831977844238281, + "num_tokens": 516300515.0, + "step": 14167 + }, + { + "epoch": 2.631012070566388, + "grad_norm": 1.6155116558074951, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8832578063011169, + "num_tokens": 516343450.0, + "step": 14168 + }, + { + "epoch": 2.6311977715877437, + "grad_norm": 1.5053287744522095, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8802658915519714, + "num_tokens": 516385931.0, + "step": 14169 + }, + { + "epoch": 2.6313834726090994, + "grad_norm": 1.7406214475631714, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8908175230026245, + "num_tokens": 516421370.0, + "step": 14170 + }, + { + "epoch": 2.6315691736304547, + "grad_norm": 1.665172815322876, + "learning_rate": 1e-06, + "loss": 0.2733, + "mean_token_accuracy": 0.9001467227935791, + "num_tokens": 516453599.0, + "step": 14171 + }, + { + "epoch": 2.6317548746518105, + "grad_norm": 1.527153730392456, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8862006664276123, + "num_tokens": 516494302.0, + "step": 14172 + }, + { + "epoch": 2.631940575673166, + "grad_norm": 1.6331852674484253, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8855315446853638, + "num_tokens": 516531469.0, + "step": 14173 + }, + { + "epoch": 2.632126276694522, + "grad_norm": 1.6714459657669067, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8776159286499023, + "num_tokens": 516564153.0, + "step": 14174 + }, + { + "epoch": 2.6323119777158777, + "grad_norm": 1.746775507926941, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.885871410369873, + "num_tokens": 516597769.0, + "step": 14175 + }, + { + "epoch": 2.632497678737233, + "grad_norm": 1.5711201429367065, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8918402791023254, + "num_tokens": 516635813.0, + "step": 14176 + }, + { + "epoch": 2.6326833797585887, + "grad_norm": 1.485790491104126, + "learning_rate": 1e-06, + "loss": 0.2775, + "mean_token_accuracy": 0.8989179134368896, + "num_tokens": 516675554.0, + "step": 14177 + }, + { + "epoch": 2.6328690807799444, + "grad_norm": 1.6251686811447144, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8715471625328064, + "num_tokens": 516715053.0, + "step": 14178 + }, + { + "epoch": 2.6330547818012997, + "grad_norm": 1.5405789613723755, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8903046250343323, + "num_tokens": 516751132.0, + "step": 14179 + }, + { + "epoch": 2.6332404828226554, + "grad_norm": 1.624768853187561, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8728256821632385, + "num_tokens": 516788707.0, + "step": 14180 + }, + { + "epoch": 2.633426183844011, + "grad_norm": 1.6590615510940552, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8887053728103638, + "num_tokens": 516822068.0, + "step": 14181 + }, + { + "epoch": 2.633611884865367, + "grad_norm": 1.5555155277252197, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8890945911407471, + "num_tokens": 516859669.0, + "step": 14182 + }, + { + "epoch": 2.6337975858867226, + "grad_norm": 1.626859188079834, + "learning_rate": 1e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.8952495455741882, + "num_tokens": 516893283.0, + "step": 14183 + }, + { + "epoch": 2.633983286908078, + "grad_norm": 1.5243079662322998, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8817643523216248, + "num_tokens": 516935358.0, + "step": 14184 + }, + { + "epoch": 2.6341689879294337, + "grad_norm": 1.571778655052185, + "learning_rate": 1e-06, + "loss": 0.2796, + "mean_token_accuracy": 0.8987550735473633, + "num_tokens": 516968943.0, + "step": 14185 + }, + { + "epoch": 2.6343546889507894, + "grad_norm": 1.6820805072784424, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8830861449241638, + "num_tokens": 517004750.0, + "step": 14186 + }, + { + "epoch": 2.6345403899721447, + "grad_norm": 1.5958222150802612, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8813645839691162, + "num_tokens": 517043599.0, + "step": 14187 + }, + { + "epoch": 2.6347260909935004, + "grad_norm": 1.705386996269226, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8773621320724487, + "num_tokens": 517079088.0, + "step": 14188 + }, + { + "epoch": 2.634911792014856, + "grad_norm": 1.540144920349121, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8918291330337524, + "num_tokens": 517115750.0, + "step": 14189 + }, + { + "epoch": 2.635097493036212, + "grad_norm": 1.5963687896728516, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8792417049407959, + "num_tokens": 517154181.0, + "step": 14190 + }, + { + "epoch": 2.635283194057567, + "grad_norm": 1.5954207181930542, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8943789601325989, + "num_tokens": 517189815.0, + "step": 14191 + }, + { + "epoch": 2.635468895078923, + "grad_norm": 1.5881390571594238, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8760632276535034, + "num_tokens": 517229154.0, + "step": 14192 + }, + { + "epoch": 2.6356545961002786, + "grad_norm": 1.566489577293396, + "learning_rate": 1e-06, + "loss": 0.274, + "mean_token_accuracy": 0.8998023271560669, + "num_tokens": 517264573.0, + "step": 14193 + }, + { + "epoch": 2.635840297121634, + "grad_norm": 1.5274690389633179, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8870519995689392, + "num_tokens": 517304193.0, + "step": 14194 + }, + { + "epoch": 2.6360259981429897, + "grad_norm": 1.502150297164917, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.889517068862915, + "num_tokens": 517345363.0, + "step": 14195 + }, + { + "epoch": 2.6362116991643454, + "grad_norm": 1.5526223182678223, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8916463851928711, + "num_tokens": 517379797.0, + "step": 14196 + }, + { + "epoch": 2.636397400185701, + "grad_norm": 1.58039128780365, + "learning_rate": 1e-06, + "loss": 0.281, + "mean_token_accuracy": 0.895829439163208, + "num_tokens": 517415083.0, + "step": 14197 + }, + { + "epoch": 2.636583101207057, + "grad_norm": 1.5237711668014526, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8878055810928345, + "num_tokens": 517453865.0, + "step": 14198 + }, + { + "epoch": 2.636768802228412, + "grad_norm": 1.5077438354492188, + "learning_rate": 1e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.897675096988678, + "num_tokens": 517492802.0, + "step": 14199 + }, + { + "epoch": 2.636954503249768, + "grad_norm": 1.9124168157577515, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8919522166252136, + "num_tokens": 517522871.0, + "step": 14200 + }, + { + "epoch": 2.6371402042711236, + "grad_norm": 1.7938231229782104, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.879159688949585, + "num_tokens": 517558037.0, + "step": 14201 + }, + { + "epoch": 2.637325905292479, + "grad_norm": 1.6434545516967773, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8907778859138489, + "num_tokens": 517593610.0, + "step": 14202 + }, + { + "epoch": 2.6375116063138346, + "grad_norm": 1.630298137664795, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8849021196365356, + "num_tokens": 517629123.0, + "step": 14203 + }, + { + "epoch": 2.6376973073351904, + "grad_norm": 1.704854965209961, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8881505727767944, + "num_tokens": 517661893.0, + "step": 14204 + }, + { + "epoch": 2.637883008356546, + "grad_norm": 1.5721772909164429, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8926365375518799, + "num_tokens": 517700504.0, + "step": 14205 + }, + { + "epoch": 2.638068709377902, + "grad_norm": 1.5840816497802734, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8870671987533569, + "num_tokens": 517736570.0, + "step": 14206 + }, + { + "epoch": 2.638254410399257, + "grad_norm": 1.6477563381195068, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8904557228088379, + "num_tokens": 517768882.0, + "step": 14207 + }, + { + "epoch": 2.638440111420613, + "grad_norm": 1.672362208366394, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8865047693252563, + "num_tokens": 517803390.0, + "step": 14208 + }, + { + "epoch": 2.6386258124419686, + "grad_norm": 1.66019868850708, + "learning_rate": 1e-06, + "loss": 0.2782, + "mean_token_accuracy": 0.9001405239105225, + "num_tokens": 517837115.0, + "step": 14209 + }, + { + "epoch": 2.638811513463324, + "grad_norm": 1.5999852418899536, + "learning_rate": 1e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.8988901376724243, + "num_tokens": 517874326.0, + "step": 14210 + }, + { + "epoch": 2.6389972144846796, + "grad_norm": 1.6330119371414185, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8809993267059326, + "num_tokens": 517913433.0, + "step": 14211 + }, + { + "epoch": 2.6391829155060353, + "grad_norm": 1.7213406562805176, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8625611066818237, + "num_tokens": 517951413.0, + "step": 14212 + }, + { + "epoch": 2.639368616527391, + "grad_norm": 1.7019670009613037, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8828949928283691, + "num_tokens": 517985057.0, + "step": 14213 + }, + { + "epoch": 2.6395543175487464, + "grad_norm": 1.542978048324585, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8840742111206055, + "num_tokens": 518027518.0, + "step": 14214 + }, + { + "epoch": 2.639740018570102, + "grad_norm": 1.774530053138733, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8945814371109009, + "num_tokens": 518061244.0, + "step": 14215 + }, + { + "epoch": 2.639925719591458, + "grad_norm": 1.7102116346359253, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8827807903289795, + "num_tokens": 518097864.0, + "step": 14216 + }, + { + "epoch": 2.640111420612813, + "grad_norm": 1.4998552799224854, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8911740779876709, + "num_tokens": 518138638.0, + "step": 14217 + }, + { + "epoch": 2.640297121634169, + "grad_norm": 1.6491305828094482, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.8984616994857788, + "num_tokens": 518174838.0, + "step": 14218 + }, + { + "epoch": 2.6404828226555246, + "grad_norm": 1.552984595298767, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8958628177642822, + "num_tokens": 518209325.0, + "step": 14219 + }, + { + "epoch": 2.6406685236768803, + "grad_norm": 1.8390876054763794, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8830020427703857, + "num_tokens": 518240480.0, + "step": 14220 + }, + { + "epoch": 2.640854224698236, + "grad_norm": 1.6264538764953613, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8737322688102722, + "num_tokens": 518282732.0, + "step": 14221 + }, + { + "epoch": 2.6410399257195913, + "grad_norm": 1.6578121185302734, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8846460580825806, + "num_tokens": 518317844.0, + "step": 14222 + }, + { + "epoch": 2.641225626740947, + "grad_norm": 1.7645342350006104, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8802987933158875, + "num_tokens": 518348207.0, + "step": 14223 + }, + { + "epoch": 2.641411327762303, + "grad_norm": 1.7255375385284424, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8855565190315247, + "num_tokens": 518386872.0, + "step": 14224 + }, + { + "epoch": 2.641597028783658, + "grad_norm": 1.7124589681625366, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8839260339736938, + "num_tokens": 518424979.0, + "step": 14225 + }, + { + "epoch": 2.641782729805014, + "grad_norm": 1.5480842590332031, + "learning_rate": 1e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.8986244201660156, + "num_tokens": 518461431.0, + "step": 14226 + }, + { + "epoch": 2.6419684308263696, + "grad_norm": 1.5550448894500732, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8901188373565674, + "num_tokens": 518498818.0, + "step": 14227 + }, + { + "epoch": 2.6421541318477253, + "grad_norm": 1.4930615425109863, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8932512998580933, + "num_tokens": 518539628.0, + "step": 14228 + }, + { + "epoch": 2.642339832869081, + "grad_norm": 1.6532106399536133, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8819177746772766, + "num_tokens": 518577703.0, + "step": 14229 + }, + { + "epoch": 2.6425255338904363, + "grad_norm": 1.6800906658172607, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8942773938179016, + "num_tokens": 518615189.0, + "step": 14230 + }, + { + "epoch": 2.642711234911792, + "grad_norm": 1.7834261655807495, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8768627047538757, + "num_tokens": 518649302.0, + "step": 14231 + }, + { + "epoch": 2.642896935933148, + "grad_norm": 1.7338300943374634, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.8952680826187134, + "num_tokens": 518684674.0, + "step": 14232 + }, + { + "epoch": 2.643082636954503, + "grad_norm": 1.4962258338928223, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8843386769294739, + "num_tokens": 518725416.0, + "step": 14233 + }, + { + "epoch": 2.643268337975859, + "grad_norm": 1.5168559551239014, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8952863812446594, + "num_tokens": 518761521.0, + "step": 14234 + }, + { + "epoch": 2.6434540389972145, + "grad_norm": 1.8178505897521973, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8795753121376038, + "num_tokens": 518798635.0, + "step": 14235 + }, + { + "epoch": 2.6436397400185703, + "grad_norm": 1.5964733362197876, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8940364122390747, + "num_tokens": 518839377.0, + "step": 14236 + }, + { + "epoch": 2.6438254410399256, + "grad_norm": 1.6803845167160034, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.87732994556427, + "num_tokens": 518880010.0, + "step": 14237 + }, + { + "epoch": 2.6440111420612813, + "grad_norm": 1.6806732416152954, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8841507434844971, + "num_tokens": 518922778.0, + "step": 14238 + }, + { + "epoch": 2.644196843082637, + "grad_norm": 1.8987475633621216, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8641040325164795, + "num_tokens": 518957935.0, + "step": 14239 + }, + { + "epoch": 2.6443825441039923, + "grad_norm": 1.5701453685760498, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8816191554069519, + "num_tokens": 518997587.0, + "step": 14240 + }, + { + "epoch": 2.644568245125348, + "grad_norm": 1.5781569480895996, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8874940872192383, + "num_tokens": 519032502.0, + "step": 14241 + }, + { + "epoch": 2.644753946146704, + "grad_norm": 1.7374112606048584, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8856642246246338, + "num_tokens": 519067762.0, + "step": 14242 + }, + { + "epoch": 2.6449396471680595, + "grad_norm": 1.7043124437332153, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8889106512069702, + "num_tokens": 519102471.0, + "step": 14243 + }, + { + "epoch": 2.6451253481894152, + "grad_norm": 1.5668315887451172, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8939214944839478, + "num_tokens": 519140579.0, + "step": 14244 + }, + { + "epoch": 2.6453110492107705, + "grad_norm": 1.6027984619140625, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8938788771629333, + "num_tokens": 519176051.0, + "step": 14245 + }, + { + "epoch": 2.6454967502321263, + "grad_norm": 1.917932391166687, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8760216236114502, + "num_tokens": 519209513.0, + "step": 14246 + }, + { + "epoch": 2.645682451253482, + "grad_norm": 1.5709211826324463, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8939013481140137, + "num_tokens": 519246832.0, + "step": 14247 + }, + { + "epoch": 2.6458681522748373, + "grad_norm": 1.5508697032928467, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8851732611656189, + "num_tokens": 519289422.0, + "step": 14248 + }, + { + "epoch": 2.646053853296193, + "grad_norm": 1.567198395729065, + "learning_rate": 1e-06, + "loss": 0.2732, + "mean_token_accuracy": 0.8977532386779785, + "num_tokens": 519323444.0, + "step": 14249 + }, + { + "epoch": 2.6462395543175488, + "grad_norm": 1.7474822998046875, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8867154121398926, + "num_tokens": 519353708.0, + "step": 14250 + }, + { + "epoch": 2.6464252553389045, + "grad_norm": 1.6230708360671997, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8956161737442017, + "num_tokens": 519384952.0, + "step": 14251 + }, + { + "epoch": 2.6466109563602602, + "grad_norm": 1.561149001121521, + "learning_rate": 1e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.8957923650741577, + "num_tokens": 519421064.0, + "step": 14252 + }, + { + "epoch": 2.6467966573816155, + "grad_norm": 1.6665074825286865, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8877201080322266, + "num_tokens": 519455818.0, + "step": 14253 + }, + { + "epoch": 2.6469823584029712, + "grad_norm": 1.6792833805084229, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8889342546463013, + "num_tokens": 519490307.0, + "step": 14254 + }, + { + "epoch": 2.647168059424327, + "grad_norm": 1.5816378593444824, + "learning_rate": 1e-06, + "loss": 0.2765, + "mean_token_accuracy": 0.8991131782531738, + "num_tokens": 519523717.0, + "step": 14255 + }, + { + "epoch": 2.6473537604456823, + "grad_norm": 1.5470716953277588, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8898472785949707, + "num_tokens": 519561647.0, + "step": 14256 + }, + { + "epoch": 2.647539461467038, + "grad_norm": 1.619251012802124, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8864182233810425, + "num_tokens": 519597338.0, + "step": 14257 + }, + { + "epoch": 2.6477251624883937, + "grad_norm": 1.6587320566177368, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8895086050033569, + "num_tokens": 519633736.0, + "step": 14258 + }, + { + "epoch": 2.6479108635097495, + "grad_norm": 1.63720703125, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.892265260219574, + "num_tokens": 519667268.0, + "step": 14259 + }, + { + "epoch": 2.6480965645311048, + "grad_norm": 1.5179100036621094, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8924307823181152, + "num_tokens": 519704962.0, + "step": 14260 + }, + { + "epoch": 2.6482822655524605, + "grad_norm": 1.8032366037368774, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8852456212043762, + "num_tokens": 519738122.0, + "step": 14261 + }, + { + "epoch": 2.6484679665738162, + "grad_norm": 1.645482063293457, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8837669491767883, + "num_tokens": 519774195.0, + "step": 14262 + }, + { + "epoch": 2.6486536675951715, + "grad_norm": 1.8046050071716309, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8950806856155396, + "num_tokens": 519802079.0, + "step": 14263 + }, + { + "epoch": 2.6488393686165272, + "grad_norm": 1.6539244651794434, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8913534283638, + "num_tokens": 519836565.0, + "step": 14264 + }, + { + "epoch": 2.649025069637883, + "grad_norm": 1.5436080694198608, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8863775730133057, + "num_tokens": 519875899.0, + "step": 14265 + }, + { + "epoch": 2.6492107706592387, + "grad_norm": 1.6545164585113525, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.891811728477478, + "num_tokens": 519909844.0, + "step": 14266 + }, + { + "epoch": 2.6493964716805944, + "grad_norm": 1.5538462400436401, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8848636150360107, + "num_tokens": 519950134.0, + "step": 14267 + }, + { + "epoch": 2.6495821727019497, + "grad_norm": 1.6600441932678223, + "learning_rate": 1e-06, + "loss": 0.2839, + "mean_token_accuracy": 0.8994532823562622, + "num_tokens": 519981076.0, + "step": 14268 + }, + { + "epoch": 2.6497678737233055, + "grad_norm": 1.9222407341003418, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8768531680107117, + "num_tokens": 520012948.0, + "step": 14269 + }, + { + "epoch": 2.649953574744661, + "grad_norm": 1.7364404201507568, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8682061433792114, + "num_tokens": 520047819.0, + "step": 14270 + }, + { + "epoch": 2.6501392757660165, + "grad_norm": 1.6645172834396362, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8852746486663818, + "num_tokens": 520082155.0, + "step": 14271 + }, + { + "epoch": 2.650324976787372, + "grad_norm": 1.510900855064392, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8880584836006165, + "num_tokens": 520120018.0, + "step": 14272 + }, + { + "epoch": 2.650510677808728, + "grad_norm": 1.632333517074585, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8867889642715454, + "num_tokens": 520156976.0, + "step": 14273 + }, + { + "epoch": 2.6506963788300837, + "grad_norm": 1.709632158279419, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.888444185256958, + "num_tokens": 520189816.0, + "step": 14274 + }, + { + "epoch": 2.6508820798514394, + "grad_norm": 1.6585924625396729, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8786302804946899, + "num_tokens": 520224521.0, + "step": 14275 + }, + { + "epoch": 2.6510677808727947, + "grad_norm": 1.4992966651916504, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8820676207542419, + "num_tokens": 520262261.0, + "step": 14276 + }, + { + "epoch": 2.6512534818941504, + "grad_norm": 1.4771071672439575, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8838357925415039, + "num_tokens": 520305727.0, + "step": 14277 + }, + { + "epoch": 2.651439182915506, + "grad_norm": 1.8631724119186401, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8750057220458984, + "num_tokens": 520335737.0, + "step": 14278 + }, + { + "epoch": 2.6516248839368615, + "grad_norm": 1.6984971761703491, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.885265588760376, + "num_tokens": 520372847.0, + "step": 14279 + }, + { + "epoch": 2.651810584958217, + "grad_norm": 1.6632224321365356, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.881277859210968, + "num_tokens": 520408910.0, + "step": 14280 + }, + { + "epoch": 2.651996285979573, + "grad_norm": 1.5482662916183472, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8838458061218262, + "num_tokens": 520449057.0, + "step": 14281 + }, + { + "epoch": 2.6521819870009287, + "grad_norm": 1.6772271394729614, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8824454545974731, + "num_tokens": 520484434.0, + "step": 14282 + }, + { + "epoch": 2.6523676880222844, + "grad_norm": 1.7510398626327515, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8925052881240845, + "num_tokens": 520517729.0, + "step": 14283 + }, + { + "epoch": 2.6525533890436397, + "grad_norm": 1.744028091430664, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8818821907043457, + "num_tokens": 520551487.0, + "step": 14284 + }, + { + "epoch": 2.6527390900649954, + "grad_norm": 1.6845076084136963, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8871749639511108, + "num_tokens": 520586836.0, + "step": 14285 + }, + { + "epoch": 2.6529247910863507, + "grad_norm": 1.6240936517715454, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8821741938591003, + "num_tokens": 520624573.0, + "step": 14286 + }, + { + "epoch": 2.6531104921077064, + "grad_norm": 1.7195537090301514, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8899973630905151, + "num_tokens": 520659539.0, + "step": 14287 + }, + { + "epoch": 2.653296193129062, + "grad_norm": 1.6524187326431274, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8953684568405151, + "num_tokens": 520693137.0, + "step": 14288 + }, + { + "epoch": 2.653481894150418, + "grad_norm": 1.6397333145141602, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8829702734947205, + "num_tokens": 520728619.0, + "step": 14289 + }, + { + "epoch": 2.6536675951717736, + "grad_norm": 1.8172882795333862, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8599833250045776, + "num_tokens": 520763691.0, + "step": 14290 + }, + { + "epoch": 2.653853296193129, + "grad_norm": 1.6768594980239868, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8862146735191345, + "num_tokens": 520795243.0, + "step": 14291 + }, + { + "epoch": 2.6540389972144847, + "grad_norm": 1.7307614088058472, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8717672824859619, + "num_tokens": 520830202.0, + "step": 14292 + }, + { + "epoch": 2.6542246982358404, + "grad_norm": 1.7460107803344727, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8914532661437988, + "num_tokens": 520861064.0, + "step": 14293 + }, + { + "epoch": 2.6544103992571957, + "grad_norm": 1.6107542514801025, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.891248345375061, + "num_tokens": 520898183.0, + "step": 14294 + }, + { + "epoch": 2.6545961002785514, + "grad_norm": 1.6175353527069092, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8859579563140869, + "num_tokens": 520931888.0, + "step": 14295 + }, + { + "epoch": 2.654781801299907, + "grad_norm": 1.5313286781311035, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8846315145492554, + "num_tokens": 520972666.0, + "step": 14296 + }, + { + "epoch": 2.654967502321263, + "grad_norm": 1.4415576457977295, + "learning_rate": 1e-06, + "loss": 0.2625, + "mean_token_accuracy": 0.9035181999206543, + "num_tokens": 521009948.0, + "step": 14297 + }, + { + "epoch": 2.6551532033426186, + "grad_norm": 1.4514060020446777, + "learning_rate": 1e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.8973461985588074, + "num_tokens": 521047743.0, + "step": 14298 + }, + { + "epoch": 2.655338904363974, + "grad_norm": 1.5825735330581665, + "learning_rate": 1e-06, + "loss": 0.2749, + "mean_token_accuracy": 0.8976391553878784, + "num_tokens": 521081367.0, + "step": 14299 + }, + { + "epoch": 2.6555246053853296, + "grad_norm": 1.4396783113479614, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8825244307518005, + "num_tokens": 521126574.0, + "step": 14300 + }, + { + "epoch": 2.6557103064066854, + "grad_norm": 1.4680153131484985, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.8984577655792236, + "num_tokens": 521164317.0, + "step": 14301 + }, + { + "epoch": 2.6558960074280407, + "grad_norm": 1.5493439435958862, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8900578022003174, + "num_tokens": 521202672.0, + "step": 14302 + }, + { + "epoch": 2.6560817084493964, + "grad_norm": 1.7918413877487183, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8762239217758179, + "num_tokens": 521237764.0, + "step": 14303 + }, + { + "epoch": 2.656267409470752, + "grad_norm": 1.531638264656067, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8923206329345703, + "num_tokens": 521275795.0, + "step": 14304 + }, + { + "epoch": 2.656453110492108, + "grad_norm": 1.604274034500122, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.8964561223983765, + "num_tokens": 521308771.0, + "step": 14305 + }, + { + "epoch": 2.6566388115134636, + "grad_norm": 1.681359887123108, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8946484923362732, + "num_tokens": 521340241.0, + "step": 14306 + }, + { + "epoch": 2.656824512534819, + "grad_norm": 1.6599342823028564, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8854811787605286, + "num_tokens": 521375886.0, + "step": 14307 + }, + { + "epoch": 2.6570102135561746, + "grad_norm": 1.733357548713684, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8629233241081238, + "num_tokens": 521412649.0, + "step": 14308 + }, + { + "epoch": 2.65719591457753, + "grad_norm": 1.775465726852417, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8634123206138611, + "num_tokens": 521449155.0, + "step": 14309 + }, + { + "epoch": 2.6573816155988856, + "grad_norm": 1.7316977977752686, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8891815543174744, + "num_tokens": 521483716.0, + "step": 14310 + }, + { + "epoch": 2.6575673166202414, + "grad_norm": 1.734734058380127, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8737181425094604, + "num_tokens": 521520793.0, + "step": 14311 + }, + { + "epoch": 2.657753017641597, + "grad_norm": 1.6909575462341309, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8771597146987915, + "num_tokens": 521557102.0, + "step": 14312 + }, + { + "epoch": 2.657938718662953, + "grad_norm": 1.6428064107894897, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8919075131416321, + "num_tokens": 521591522.0, + "step": 14313 + }, + { + "epoch": 2.658124419684308, + "grad_norm": 1.6121503114700317, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.8936138153076172, + "num_tokens": 521627504.0, + "step": 14314 + }, + { + "epoch": 2.658310120705664, + "grad_norm": 1.5586508512496948, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8849791288375854, + "num_tokens": 521665794.0, + "step": 14315 + }, + { + "epoch": 2.6584958217270196, + "grad_norm": 1.6584622859954834, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8869636058807373, + "num_tokens": 521699420.0, + "step": 14316 + }, + { + "epoch": 2.658681522748375, + "grad_norm": 1.658393383026123, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8865853548049927, + "num_tokens": 521733833.0, + "step": 14317 + }, + { + "epoch": 2.6588672237697306, + "grad_norm": 1.5850796699523926, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8942371010780334, + "num_tokens": 521769874.0, + "step": 14318 + }, + { + "epoch": 2.6590529247910863, + "grad_norm": 1.4745526313781738, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8817240595817566, + "num_tokens": 521812581.0, + "step": 14319 + }, + { + "epoch": 2.659238625812442, + "grad_norm": 1.688320517539978, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8784720301628113, + "num_tokens": 521852135.0, + "step": 14320 + }, + { + "epoch": 2.659424326833798, + "grad_norm": 1.5982540845870972, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.892680287361145, + "num_tokens": 521887327.0, + "step": 14321 + }, + { + "epoch": 2.659610027855153, + "grad_norm": 1.5799459218978882, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8826354146003723, + "num_tokens": 521926777.0, + "step": 14322 + }, + { + "epoch": 2.659795728876509, + "grad_norm": 1.61117684841156, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8919454216957092, + "num_tokens": 521961956.0, + "step": 14323 + }, + { + "epoch": 2.6599814298978646, + "grad_norm": 1.5987303256988525, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8940349221229553, + "num_tokens": 522000230.0, + "step": 14324 + }, + { + "epoch": 2.66016713091922, + "grad_norm": 1.5574434995651245, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8873926401138306, + "num_tokens": 522038996.0, + "step": 14325 + }, + { + "epoch": 2.6603528319405756, + "grad_norm": 1.7384685277938843, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.870169997215271, + "num_tokens": 522074362.0, + "step": 14326 + }, + { + "epoch": 2.6605385329619313, + "grad_norm": 1.6479243040084839, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8889550566673279, + "num_tokens": 522109747.0, + "step": 14327 + }, + { + "epoch": 2.660724233983287, + "grad_norm": 1.488366961479187, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8910552263259888, + "num_tokens": 522153295.0, + "step": 14328 + }, + { + "epoch": 2.660909935004643, + "grad_norm": 1.4731848239898682, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8951864242553711, + "num_tokens": 522192705.0, + "step": 14329 + }, + { + "epoch": 2.661095636025998, + "grad_norm": 1.4728529453277588, + "learning_rate": 1e-06, + "loss": 0.279, + "mean_token_accuracy": 0.8994083404541016, + "num_tokens": 522233361.0, + "step": 14330 + }, + { + "epoch": 2.661281337047354, + "grad_norm": 1.565955638885498, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8968713283538818, + "num_tokens": 522270756.0, + "step": 14331 + }, + { + "epoch": 2.661467038068709, + "grad_norm": 1.6015527248382568, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8912546634674072, + "num_tokens": 522305647.0, + "step": 14332 + }, + { + "epoch": 2.661652739090065, + "grad_norm": 1.652185320854187, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.881587028503418, + "num_tokens": 522342126.0, + "step": 14333 + }, + { + "epoch": 2.6618384401114206, + "grad_norm": 1.5843946933746338, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8892549276351929, + "num_tokens": 522379588.0, + "step": 14334 + }, + { + "epoch": 2.6620241411327763, + "grad_norm": 1.6308050155639648, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8817110657691956, + "num_tokens": 522415223.0, + "step": 14335 + }, + { + "epoch": 2.662209842154132, + "grad_norm": 1.6948859691619873, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8825892806053162, + "num_tokens": 522450255.0, + "step": 14336 + }, + { + "epoch": 2.6623955431754873, + "grad_norm": 1.5965702533721924, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8922979235649109, + "num_tokens": 522485300.0, + "step": 14337 + }, + { + "epoch": 2.662581244196843, + "grad_norm": 1.6743931770324707, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8903658986091614, + "num_tokens": 522518250.0, + "step": 14338 + }, + { + "epoch": 2.662766945218199, + "grad_norm": 1.616829752922058, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8795905113220215, + "num_tokens": 522556258.0, + "step": 14339 + }, + { + "epoch": 2.662952646239554, + "grad_norm": 1.5211588144302368, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8856185674667358, + "num_tokens": 522598024.0, + "step": 14340 + }, + { + "epoch": 2.66313834726091, + "grad_norm": 1.7843761444091797, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.879177451133728, + "num_tokens": 522632967.0, + "step": 14341 + }, + { + "epoch": 2.6633240482822655, + "grad_norm": 1.5143905878067017, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8900865316390991, + "num_tokens": 522673907.0, + "step": 14342 + }, + { + "epoch": 2.6635097493036213, + "grad_norm": 1.490721583366394, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8908743858337402, + "num_tokens": 522713614.0, + "step": 14343 + }, + { + "epoch": 2.663695450324977, + "grad_norm": 1.6196125745773315, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8854624032974243, + "num_tokens": 522751009.0, + "step": 14344 + }, + { + "epoch": 2.6638811513463323, + "grad_norm": 1.4218019247055054, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.894495964050293, + "num_tokens": 522794631.0, + "step": 14345 + }, + { + "epoch": 2.664066852367688, + "grad_norm": 1.7064553499221802, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8726979494094849, + "num_tokens": 522831506.0, + "step": 14346 + }, + { + "epoch": 2.6642525533890438, + "grad_norm": 1.553467035293579, + "learning_rate": 1e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.8968547582626343, + "num_tokens": 522867642.0, + "step": 14347 + }, + { + "epoch": 2.664438254410399, + "grad_norm": 1.5645800828933716, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.887546718120575, + "num_tokens": 522906713.0, + "step": 14348 + }, + { + "epoch": 2.664623955431755, + "grad_norm": 1.528253197669983, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8889641761779785, + "num_tokens": 522947362.0, + "step": 14349 + }, + { + "epoch": 2.6648096564531105, + "grad_norm": 1.7039977312088013, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8844050168991089, + "num_tokens": 522981132.0, + "step": 14350 + }, + { + "epoch": 2.6649953574744663, + "grad_norm": 1.566603422164917, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.8981162905693054, + "num_tokens": 523016751.0, + "step": 14351 + }, + { + "epoch": 2.665181058495822, + "grad_norm": 1.6559416055679321, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8864837884902954, + "num_tokens": 523049771.0, + "step": 14352 + }, + { + "epoch": 2.6653667595171773, + "grad_norm": 1.58437180519104, + "learning_rate": 1e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.8967896699905396, + "num_tokens": 523087525.0, + "step": 14353 + }, + { + "epoch": 2.665552460538533, + "grad_norm": 1.5384981632232666, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.8974453806877136, + "num_tokens": 523124364.0, + "step": 14354 + }, + { + "epoch": 2.6657381615598887, + "grad_norm": 1.490999698638916, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8855024576187134, + "num_tokens": 523165199.0, + "step": 14355 + }, + { + "epoch": 2.665923862581244, + "grad_norm": 1.5889387130737305, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8751814365386963, + "num_tokens": 523202736.0, + "step": 14356 + }, + { + "epoch": 2.6661095636025998, + "grad_norm": 1.536385178565979, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8970493078231812, + "num_tokens": 523238477.0, + "step": 14357 + }, + { + "epoch": 2.6662952646239555, + "grad_norm": 1.5606354475021362, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8964093327522278, + "num_tokens": 523275068.0, + "step": 14358 + }, + { + "epoch": 2.6664809656453112, + "grad_norm": 1.4651503562927246, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8767696619033813, + "num_tokens": 523320422.0, + "step": 14359 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.5255242586135864, + "learning_rate": 1e-06, + "loss": 0.269, + "mean_token_accuracy": 0.9028663635253906, + "num_tokens": 523353812.0, + "step": 14360 + }, + { + "epoch": 2.6668523676880223, + "grad_norm": 1.5269345045089722, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8904030323028564, + "num_tokens": 523392582.0, + "step": 14361 + }, + { + "epoch": 2.667038068709378, + "grad_norm": 1.5556914806365967, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8888376355171204, + "num_tokens": 523431268.0, + "step": 14362 + }, + { + "epoch": 2.6672237697307333, + "grad_norm": 1.515152096748352, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8838350772857666, + "num_tokens": 523472723.0, + "step": 14363 + }, + { + "epoch": 2.667409470752089, + "grad_norm": 1.626267671585083, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8600486516952515, + "num_tokens": 523514297.0, + "step": 14364 + }, + { + "epoch": 2.6675951717734447, + "grad_norm": 1.6917073726654053, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8946187496185303, + "num_tokens": 523547785.0, + "step": 14365 + }, + { + "epoch": 2.6677808727948005, + "grad_norm": 1.6488313674926758, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8883841037750244, + "num_tokens": 523582441.0, + "step": 14366 + }, + { + "epoch": 2.667966573816156, + "grad_norm": 1.5257145166397095, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.892663836479187, + "num_tokens": 523621331.0, + "step": 14367 + }, + { + "epoch": 2.6681522748375115, + "grad_norm": 1.8030831813812256, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8862642049789429, + "num_tokens": 523653073.0, + "step": 14368 + }, + { + "epoch": 2.6683379758588672, + "grad_norm": 1.6353391408920288, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.877232551574707, + "num_tokens": 523692011.0, + "step": 14369 + }, + { + "epoch": 2.668523676880223, + "grad_norm": 1.759302020072937, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8835300207138062, + "num_tokens": 523726579.0, + "step": 14370 + }, + { + "epoch": 2.6687093779015783, + "grad_norm": 1.7789298295974731, + "learning_rate": 1e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9011468887329102, + "num_tokens": 523755930.0, + "step": 14371 + }, + { + "epoch": 2.668895078922934, + "grad_norm": 1.6943328380584717, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8681738972663879, + "num_tokens": 523793535.0, + "step": 14372 + }, + { + "epoch": 2.6690807799442897, + "grad_norm": 1.807199239730835, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.883795976638794, + "num_tokens": 523826026.0, + "step": 14373 + }, + { + "epoch": 2.6692664809656454, + "grad_norm": 1.5274884700775146, + "learning_rate": 1e-06, + "loss": 0.2708, + "mean_token_accuracy": 0.9023311138153076, + "num_tokens": 523869615.0, + "step": 14374 + }, + { + "epoch": 2.669452181987001, + "grad_norm": 1.708498477935791, + "learning_rate": 1e-06, + "loss": 0.2606, + "mean_token_accuracy": 0.9060444235801697, + "num_tokens": 523897433.0, + "step": 14375 + }, + { + "epoch": 2.6696378830083565, + "grad_norm": 1.507080316543579, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8833959698677063, + "num_tokens": 523935159.0, + "step": 14376 + }, + { + "epoch": 2.669823584029712, + "grad_norm": 1.5703808069229126, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8906984329223633, + "num_tokens": 523972542.0, + "step": 14377 + }, + { + "epoch": 2.670009285051068, + "grad_norm": 1.5878849029541016, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8795965909957886, + "num_tokens": 524011949.0, + "step": 14378 + }, + { + "epoch": 2.6701949860724232, + "grad_norm": 1.6092780828475952, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8826834559440613, + "num_tokens": 524050306.0, + "step": 14379 + }, + { + "epoch": 2.670380687093779, + "grad_norm": 1.5739303827285767, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8786646127700806, + "num_tokens": 524089813.0, + "step": 14380 + }, + { + "epoch": 2.6705663881151347, + "grad_norm": 1.6120562553405762, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8791252374649048, + "num_tokens": 524133897.0, + "step": 14381 + }, + { + "epoch": 2.6707520891364904, + "grad_norm": 1.6858221292495728, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8796277046203613, + "num_tokens": 524170116.0, + "step": 14382 + }, + { + "epoch": 2.6709377901578457, + "grad_norm": 1.4945616722106934, + "learning_rate": 1e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.8972330093383789, + "num_tokens": 524207399.0, + "step": 14383 + }, + { + "epoch": 2.6711234911792014, + "grad_norm": 1.599922776222229, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8912760019302368, + "num_tokens": 524244485.0, + "step": 14384 + }, + { + "epoch": 2.671309192200557, + "grad_norm": 1.6244478225708008, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8847779035568237, + "num_tokens": 524281320.0, + "step": 14385 + }, + { + "epoch": 2.6714948932219125, + "grad_norm": 1.597580909729004, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8894786834716797, + "num_tokens": 524321991.0, + "step": 14386 + }, + { + "epoch": 2.671680594243268, + "grad_norm": 1.76140558719635, + "learning_rate": 1e-06, + "loss": 0.2758, + "mean_token_accuracy": 0.8976618051528931, + "num_tokens": 524353471.0, + "step": 14387 + }, + { + "epoch": 2.671866295264624, + "grad_norm": 1.5764786005020142, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.8956152200698853, + "num_tokens": 524389014.0, + "step": 14388 + }, + { + "epoch": 2.6720519962859797, + "grad_norm": 1.6637349128723145, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.8969421982765198, + "num_tokens": 524423945.0, + "step": 14389 + }, + { + "epoch": 2.6722376973073354, + "grad_norm": 1.7262542247772217, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8749221563339233, + "num_tokens": 524461223.0, + "step": 14390 + }, + { + "epoch": 2.6724233983286907, + "grad_norm": 1.5651503801345825, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8779277801513672, + "num_tokens": 524500876.0, + "step": 14391 + }, + { + "epoch": 2.6726090993500464, + "grad_norm": 1.7199382781982422, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8830698728561401, + "num_tokens": 524535862.0, + "step": 14392 + }, + { + "epoch": 2.672794800371402, + "grad_norm": 1.694625735282898, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8938452005386353, + "num_tokens": 524569661.0, + "step": 14393 + }, + { + "epoch": 2.6729805013927574, + "grad_norm": 1.7108490467071533, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8698365092277527, + "num_tokens": 524604129.0, + "step": 14394 + }, + { + "epoch": 2.673166202414113, + "grad_norm": 1.5329228639602661, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8964289426803589, + "num_tokens": 524642130.0, + "step": 14395 + }, + { + "epoch": 2.673351903435469, + "grad_norm": 1.6413172483444214, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8869445323944092, + "num_tokens": 524681030.0, + "step": 14396 + }, + { + "epoch": 2.6735376044568246, + "grad_norm": 1.588231086730957, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8763595819473267, + "num_tokens": 524722203.0, + "step": 14397 + }, + { + "epoch": 2.6737233054781804, + "grad_norm": 1.5588396787643433, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8851783871650696, + "num_tokens": 524757850.0, + "step": 14398 + }, + { + "epoch": 2.6739090064995357, + "grad_norm": 1.5416020154953003, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8938162922859192, + "num_tokens": 524794500.0, + "step": 14399 + }, + { + "epoch": 2.6740947075208914, + "grad_norm": 1.6691479682922363, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8801422715187073, + "num_tokens": 524830468.0, + "step": 14400 + }, + { + "epoch": 2.674280408542247, + "grad_norm": 1.6540619134902954, + "learning_rate": 1e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.8971071839332581, + "num_tokens": 524866663.0, + "step": 14401 + }, + { + "epoch": 2.6744661095636024, + "grad_norm": 1.6688119173049927, + "learning_rate": 1e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.8918895125389099, + "num_tokens": 524899389.0, + "step": 14402 + }, + { + "epoch": 2.674651810584958, + "grad_norm": 1.5437421798706055, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8808455467224121, + "num_tokens": 524944104.0, + "step": 14403 + }, + { + "epoch": 2.674837511606314, + "grad_norm": 1.5309362411499023, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8752877712249756, + "num_tokens": 524987592.0, + "step": 14404 + }, + { + "epoch": 2.6750232126276696, + "grad_norm": 1.5620968341827393, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8912795186042786, + "num_tokens": 525023557.0, + "step": 14405 + }, + { + "epoch": 2.675208913649025, + "grad_norm": 1.5196055173873901, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.891521692276001, + "num_tokens": 525061860.0, + "step": 14406 + }, + { + "epoch": 2.6753946146703806, + "grad_norm": 1.732987642288208, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8854371309280396, + "num_tokens": 525091929.0, + "step": 14407 + }, + { + "epoch": 2.6755803156917364, + "grad_norm": 1.5405278205871582, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8897185325622559, + "num_tokens": 525132609.0, + "step": 14408 + }, + { + "epoch": 2.6757660167130917, + "grad_norm": 1.649291753768921, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8648808002471924, + "num_tokens": 525174738.0, + "step": 14409 + }, + { + "epoch": 2.6759517177344474, + "grad_norm": 1.6103615760803223, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.8957218527793884, + "num_tokens": 525206978.0, + "step": 14410 + }, + { + "epoch": 2.676137418755803, + "grad_norm": 1.823696255683899, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8756356239318848, + "num_tokens": 525241274.0, + "step": 14411 + }, + { + "epoch": 2.676323119777159, + "grad_norm": 1.4903396368026733, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8915748000144958, + "num_tokens": 525281386.0, + "step": 14412 + }, + { + "epoch": 2.6765088207985146, + "grad_norm": 1.5502827167510986, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8904048204421997, + "num_tokens": 525320538.0, + "step": 14413 + }, + { + "epoch": 2.67669452181987, + "grad_norm": 1.64293372631073, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8789052367210388, + "num_tokens": 525358123.0, + "step": 14414 + }, + { + "epoch": 2.6768802228412256, + "grad_norm": 1.8087795972824097, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8778023719787598, + "num_tokens": 525393302.0, + "step": 14415 + }, + { + "epoch": 2.6770659238625814, + "grad_norm": 1.3963415622711182, + "learning_rate": 1e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.8967899084091187, + "num_tokens": 525437533.0, + "step": 14416 + }, + { + "epoch": 2.6772516248839366, + "grad_norm": 1.7652623653411865, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8886408805847168, + "num_tokens": 525475676.0, + "step": 14417 + }, + { + "epoch": 2.6774373259052924, + "grad_norm": 1.621055006980896, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8869585990905762, + "num_tokens": 525510730.0, + "step": 14418 + }, + { + "epoch": 2.677623026926648, + "grad_norm": 1.4249553680419922, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8954207897186279, + "num_tokens": 525549811.0, + "step": 14419 + }, + { + "epoch": 2.677808727948004, + "grad_norm": 1.6538609266281128, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8809497356414795, + "num_tokens": 525585406.0, + "step": 14420 + }, + { + "epoch": 2.6779944289693596, + "grad_norm": 1.6472091674804688, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8819092512130737, + "num_tokens": 525621910.0, + "step": 14421 + }, + { + "epoch": 2.678180129990715, + "grad_norm": 1.5917447805404663, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8896023631095886, + "num_tokens": 525659462.0, + "step": 14422 + }, + { + "epoch": 2.6783658310120706, + "grad_norm": 1.5290734767913818, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8878421783447266, + "num_tokens": 525697951.0, + "step": 14423 + }, + { + "epoch": 2.6785515320334263, + "grad_norm": 1.5104795694351196, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8856688737869263, + "num_tokens": 525737784.0, + "step": 14424 + }, + { + "epoch": 2.6787372330547816, + "grad_norm": 1.5723247528076172, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8862273693084717, + "num_tokens": 525777464.0, + "step": 14425 + }, + { + "epoch": 2.6789229340761374, + "grad_norm": 1.8190282583236694, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8817479610443115, + "num_tokens": 525806326.0, + "step": 14426 + }, + { + "epoch": 2.679108635097493, + "grad_norm": 1.5799973011016846, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8928714990615845, + "num_tokens": 525840978.0, + "step": 14427 + }, + { + "epoch": 2.679294336118849, + "grad_norm": 1.520478367805481, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8934152722358704, + "num_tokens": 525879967.0, + "step": 14428 + }, + { + "epoch": 2.679480037140204, + "grad_norm": 1.4023666381835938, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8963024616241455, + "num_tokens": 525920029.0, + "step": 14429 + }, + { + "epoch": 2.67966573816156, + "grad_norm": 1.6685045957565308, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8896995186805725, + "num_tokens": 525952659.0, + "step": 14430 + }, + { + "epoch": 2.6798514391829156, + "grad_norm": 1.580911636352539, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8919857144355774, + "num_tokens": 525989196.0, + "step": 14431 + }, + { + "epoch": 2.680037140204271, + "grad_norm": 1.5972976684570312, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8835113048553467, + "num_tokens": 526023785.0, + "step": 14432 + }, + { + "epoch": 2.6802228412256266, + "grad_norm": 1.582656741142273, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.891491174697876, + "num_tokens": 526058138.0, + "step": 14433 + }, + { + "epoch": 2.6804085422469823, + "grad_norm": 1.4911566972732544, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8887443542480469, + "num_tokens": 526099728.0, + "step": 14434 + }, + { + "epoch": 2.680594243268338, + "grad_norm": 1.5621893405914307, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8948094844818115, + "num_tokens": 526135279.0, + "step": 14435 + }, + { + "epoch": 2.680779944289694, + "grad_norm": 1.591370940208435, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8951948881149292, + "num_tokens": 526172204.0, + "step": 14436 + }, + { + "epoch": 2.680965645311049, + "grad_norm": 1.6169726848602295, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8854027986526489, + "num_tokens": 526209921.0, + "step": 14437 + }, + { + "epoch": 2.681151346332405, + "grad_norm": 1.6266204118728638, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8817092180252075, + "num_tokens": 526247162.0, + "step": 14438 + }, + { + "epoch": 2.6813370473537605, + "grad_norm": 1.6060047149658203, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8863959312438965, + "num_tokens": 526281830.0, + "step": 14439 + }, + { + "epoch": 2.681522748375116, + "grad_norm": 1.5651096105575562, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8813079595565796, + "num_tokens": 526316687.0, + "step": 14440 + }, + { + "epoch": 2.6817084493964716, + "grad_norm": 1.4881162643432617, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.878555417060852, + "num_tokens": 526358182.0, + "step": 14441 + }, + { + "epoch": 2.6818941504178273, + "grad_norm": 1.7874400615692139, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8840062022209167, + "num_tokens": 526388241.0, + "step": 14442 + }, + { + "epoch": 2.682079851439183, + "grad_norm": 1.4880532026290894, + "learning_rate": 1e-06, + "loss": 0.2796, + "mean_token_accuracy": 0.9002649188041687, + "num_tokens": 526424885.0, + "step": 14443 + }, + { + "epoch": 2.6822655524605388, + "grad_norm": 1.707656741142273, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8843100070953369, + "num_tokens": 526459350.0, + "step": 14444 + }, + { + "epoch": 2.682451253481894, + "grad_norm": 1.4943727254867554, + "learning_rate": 1e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.8988819122314453, + "num_tokens": 526495207.0, + "step": 14445 + }, + { + "epoch": 2.68263695450325, + "grad_norm": 1.5886406898498535, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8971095085144043, + "num_tokens": 526531134.0, + "step": 14446 + }, + { + "epoch": 2.6828226555246055, + "grad_norm": 1.6282862424850464, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.894265353679657, + "num_tokens": 526565801.0, + "step": 14447 + }, + { + "epoch": 2.683008356545961, + "grad_norm": 1.7086013555526733, + "learning_rate": 1e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.8956377506256104, + "num_tokens": 526599377.0, + "step": 14448 + }, + { + "epoch": 2.6831940575673165, + "grad_norm": 1.775803804397583, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8603687882423401, + "num_tokens": 526635719.0, + "step": 14449 + }, + { + "epoch": 2.6833797585886723, + "grad_norm": 1.5436235666275024, + "learning_rate": 1e-06, + "loss": 0.2618, + "mean_token_accuracy": 0.9062680006027222, + "num_tokens": 526671350.0, + "step": 14450 + }, + { + "epoch": 2.683565459610028, + "grad_norm": 1.5856136083602905, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.8979748487472534, + "num_tokens": 526706406.0, + "step": 14451 + }, + { + "epoch": 2.6837511606313837, + "grad_norm": 1.6180821657180786, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8885804414749146, + "num_tokens": 526740017.0, + "step": 14452 + }, + { + "epoch": 2.683936861652739, + "grad_norm": 1.5880012512207031, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8822399377822876, + "num_tokens": 526776294.0, + "step": 14453 + }, + { + "epoch": 2.6841225626740948, + "grad_norm": 1.7382985353469849, + "learning_rate": 1e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.9031879901885986, + "num_tokens": 526804268.0, + "step": 14454 + }, + { + "epoch": 2.68430826369545, + "grad_norm": 1.5758885145187378, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8831572532653809, + "num_tokens": 526841658.0, + "step": 14455 + }, + { + "epoch": 2.684493964716806, + "grad_norm": 1.5206375122070312, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8804908394813538, + "num_tokens": 526885837.0, + "step": 14456 + }, + { + "epoch": 2.6846796657381615, + "grad_norm": 1.5347093343734741, + "learning_rate": 1e-06, + "loss": 0.2732, + "mean_token_accuracy": 0.8999546766281128, + "num_tokens": 526920405.0, + "step": 14457 + }, + { + "epoch": 2.6848653667595173, + "grad_norm": 1.6457067728042603, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8704246282577515, + "num_tokens": 526957166.0, + "step": 14458 + }, + { + "epoch": 2.685051067780873, + "grad_norm": 1.5774165391921997, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8748877048492432, + "num_tokens": 526998403.0, + "step": 14459 + }, + { + "epoch": 2.6852367688022283, + "grad_norm": 1.6539027690887451, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8949127197265625, + "num_tokens": 527029423.0, + "step": 14460 + }, + { + "epoch": 2.685422469823584, + "grad_norm": 1.7127214670181274, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8932520151138306, + "num_tokens": 527060650.0, + "step": 14461 + }, + { + "epoch": 2.6856081708449397, + "grad_norm": 1.581892490386963, + "learning_rate": 1e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.9001717567443848, + "num_tokens": 527098343.0, + "step": 14462 + }, + { + "epoch": 2.685793871866295, + "grad_norm": 1.6905871629714966, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.884907066822052, + "num_tokens": 527131393.0, + "step": 14463 + }, + { + "epoch": 2.6859795728876508, + "grad_norm": 1.7341548204421997, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8856297731399536, + "num_tokens": 527165014.0, + "step": 14464 + }, + { + "epoch": 2.6861652739090065, + "grad_norm": 1.5467625856399536, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8951312303543091, + "num_tokens": 527206014.0, + "step": 14465 + }, + { + "epoch": 2.6863509749303622, + "grad_norm": 1.7234727144241333, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8727081418037415, + "num_tokens": 527240716.0, + "step": 14466 + }, + { + "epoch": 2.686536675951718, + "grad_norm": 1.571969985961914, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8866593837738037, + "num_tokens": 527278466.0, + "step": 14467 + }, + { + "epoch": 2.6867223769730733, + "grad_norm": 1.829760193824768, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8877829909324646, + "num_tokens": 527307029.0, + "step": 14468 + }, + { + "epoch": 2.686908077994429, + "grad_norm": 1.7725000381469727, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8852400779724121, + "num_tokens": 527338738.0, + "step": 14469 + }, + { + "epoch": 2.6870937790157847, + "grad_norm": 1.5745981931686401, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8859936594963074, + "num_tokens": 527379364.0, + "step": 14470 + }, + { + "epoch": 2.68727948003714, + "grad_norm": 1.3866634368896484, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8917936086654663, + "num_tokens": 527425148.0, + "step": 14471 + }, + { + "epoch": 2.6874651810584957, + "grad_norm": 1.6659005880355835, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8954939842224121, + "num_tokens": 527460226.0, + "step": 14472 + }, + { + "epoch": 2.6876508820798515, + "grad_norm": 1.5986988544464111, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.890353798866272, + "num_tokens": 527496321.0, + "step": 14473 + }, + { + "epoch": 2.687836583101207, + "grad_norm": 1.7267190217971802, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8862367272377014, + "num_tokens": 527535316.0, + "step": 14474 + }, + { + "epoch": 2.688022284122563, + "grad_norm": 1.6435153484344482, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8806031942367554, + "num_tokens": 527573215.0, + "step": 14475 + }, + { + "epoch": 2.6882079851439182, + "grad_norm": 1.7278084754943848, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8946675658226013, + "num_tokens": 527603794.0, + "step": 14476 + }, + { + "epoch": 2.688393686165274, + "grad_norm": 1.7563884258270264, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8779720067977905, + "num_tokens": 527645511.0, + "step": 14477 + }, + { + "epoch": 2.6885793871866293, + "grad_norm": 1.6578668355941772, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8747689723968506, + "num_tokens": 527680845.0, + "step": 14478 + }, + { + "epoch": 2.688765088207985, + "grad_norm": 1.6014142036437988, + "learning_rate": 1e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.9009276628494263, + "num_tokens": 527715103.0, + "step": 14479 + }, + { + "epoch": 2.6889507892293407, + "grad_norm": 1.7001103162765503, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8905340433120728, + "num_tokens": 527747850.0, + "step": 14480 + }, + { + "epoch": 2.6891364902506965, + "grad_norm": 1.7287886142730713, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8918893337249756, + "num_tokens": 527779341.0, + "step": 14481 + }, + { + "epoch": 2.689322191272052, + "grad_norm": 1.6111961603164673, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8876831531524658, + "num_tokens": 527817681.0, + "step": 14482 + }, + { + "epoch": 2.6895078922934075, + "grad_norm": 1.6078802347183228, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8770508170127869, + "num_tokens": 527856993.0, + "step": 14483 + }, + { + "epoch": 2.689693593314763, + "grad_norm": 1.7203023433685303, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8827043771743774, + "num_tokens": 527893064.0, + "step": 14484 + }, + { + "epoch": 2.689879294336119, + "grad_norm": 1.425830364227295, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.900154173374176, + "num_tokens": 527932001.0, + "step": 14485 + }, + { + "epoch": 2.6900649953574742, + "grad_norm": 1.714790940284729, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8844103813171387, + "num_tokens": 527968949.0, + "step": 14486 + }, + { + "epoch": 2.69025069637883, + "grad_norm": 1.5337797403335571, + "learning_rate": 1e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.8995530605316162, + "num_tokens": 528008803.0, + "step": 14487 + }, + { + "epoch": 2.6904363974001857, + "grad_norm": 1.6135671138763428, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8923326730728149, + "num_tokens": 528041842.0, + "step": 14488 + }, + { + "epoch": 2.6906220984215414, + "grad_norm": 1.6757975816726685, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8903624415397644, + "num_tokens": 528077870.0, + "step": 14489 + }, + { + "epoch": 2.690807799442897, + "grad_norm": 1.6894965171813965, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8891419768333435, + "num_tokens": 528111194.0, + "step": 14490 + }, + { + "epoch": 2.6909935004642525, + "grad_norm": 1.556904673576355, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8946309089660645, + "num_tokens": 528150703.0, + "step": 14491 + }, + { + "epoch": 2.691179201485608, + "grad_norm": 1.6942187547683716, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.888507604598999, + "num_tokens": 528181926.0, + "step": 14492 + }, + { + "epoch": 2.691364902506964, + "grad_norm": 1.6977903842926025, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.889126718044281, + "num_tokens": 528218888.0, + "step": 14493 + }, + { + "epoch": 2.691550603528319, + "grad_norm": 1.6860135793685913, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8753597736358643, + "num_tokens": 528252005.0, + "step": 14494 + }, + { + "epoch": 2.691736304549675, + "grad_norm": 1.7966904640197754, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8841177225112915, + "num_tokens": 528283381.0, + "step": 14495 + }, + { + "epoch": 2.6919220055710307, + "grad_norm": 1.598587155342102, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.8971513509750366, + "num_tokens": 528317868.0, + "step": 14496 + }, + { + "epoch": 2.6921077065923864, + "grad_norm": 1.6568862199783325, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8922759294509888, + "num_tokens": 528352974.0, + "step": 14497 + }, + { + "epoch": 2.692293407613742, + "grad_norm": 1.5496516227722168, + "learning_rate": 1e-06, + "loss": 0.2741, + "mean_token_accuracy": 0.900071382522583, + "num_tokens": 528391121.0, + "step": 14498 + }, + { + "epoch": 2.6924791086350974, + "grad_norm": 1.9480141401290894, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8880804777145386, + "num_tokens": 528420740.0, + "step": 14499 + }, + { + "epoch": 2.692664809656453, + "grad_norm": 1.605450987815857, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8869971036911011, + "num_tokens": 528458442.0, + "step": 14500 + }, + { + "epoch": 2.6928505106778085, + "grad_norm": 1.619832992553711, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8901463150978088, + "num_tokens": 528494141.0, + "step": 14501 + }, + { + "epoch": 2.693036211699164, + "grad_norm": 1.7125858068466187, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8900488615036011, + "num_tokens": 528528586.0, + "step": 14502 + }, + { + "epoch": 2.69322191272052, + "grad_norm": 1.4383785724639893, + "learning_rate": 1e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.8983215093612671, + "num_tokens": 528569295.0, + "step": 14503 + }, + { + "epoch": 2.6934076137418757, + "grad_norm": 1.7443625926971436, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8788324594497681, + "num_tokens": 528601240.0, + "step": 14504 + }, + { + "epoch": 2.6935933147632314, + "grad_norm": 1.5987200736999512, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8898229002952576, + "num_tokens": 528635413.0, + "step": 14505 + }, + { + "epoch": 2.6937790157845867, + "grad_norm": 1.7646290063858032, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8899716734886169, + "num_tokens": 528665466.0, + "step": 14506 + }, + { + "epoch": 2.6939647168059424, + "grad_norm": 1.6194138526916504, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8813046216964722, + "num_tokens": 528703504.0, + "step": 14507 + }, + { + "epoch": 2.694150417827298, + "grad_norm": 1.64040207862854, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8751864433288574, + "num_tokens": 528739930.0, + "step": 14508 + }, + { + "epoch": 2.6943361188486534, + "grad_norm": 1.951815128326416, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8827698230743408, + "num_tokens": 528779887.0, + "step": 14509 + }, + { + "epoch": 2.694521819870009, + "grad_norm": 1.4840912818908691, + "learning_rate": 1e-06, + "loss": 0.2735, + "mean_token_accuracy": 0.8997343182563782, + "num_tokens": 528818458.0, + "step": 14510 + }, + { + "epoch": 2.694707520891365, + "grad_norm": 1.574454665184021, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8844096660614014, + "num_tokens": 528856918.0, + "step": 14511 + }, + { + "epoch": 2.6948932219127206, + "grad_norm": 1.5761233568191528, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8860983848571777, + "num_tokens": 528898201.0, + "step": 14512 + }, + { + "epoch": 2.6950789229340764, + "grad_norm": 1.709549903869629, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8878376483917236, + "num_tokens": 528929917.0, + "step": 14513 + }, + { + "epoch": 2.6952646239554316, + "grad_norm": 1.5842550992965698, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8816424012184143, + "num_tokens": 528968340.0, + "step": 14514 + }, + { + "epoch": 2.6954503249767874, + "grad_norm": 1.6949056386947632, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.893526554107666, + "num_tokens": 528999935.0, + "step": 14515 + }, + { + "epoch": 2.695636025998143, + "grad_norm": 1.5121287107467651, + "learning_rate": 1e-06, + "loss": 0.2666, + "mean_token_accuracy": 0.9017801284790039, + "num_tokens": 529036606.0, + "step": 14516 + }, + { + "epoch": 2.6958217270194984, + "grad_norm": 1.711965799331665, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8824251294136047, + "num_tokens": 529070443.0, + "step": 14517 + }, + { + "epoch": 2.696007428040854, + "grad_norm": 1.7317034006118774, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8711154460906982, + "num_tokens": 529104200.0, + "step": 14518 + }, + { + "epoch": 2.69619312906221, + "grad_norm": 1.7865411043167114, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.8948208689689636, + "num_tokens": 529131747.0, + "step": 14519 + }, + { + "epoch": 2.6963788300835656, + "grad_norm": 1.617952585220337, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.8955013751983643, + "num_tokens": 529163826.0, + "step": 14520 + }, + { + "epoch": 2.6965645311049213, + "grad_norm": 1.592724084854126, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8940684795379639, + "num_tokens": 529199920.0, + "step": 14521 + }, + { + "epoch": 2.6967502321262766, + "grad_norm": 1.6207653284072876, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8896180391311646, + "num_tokens": 529237066.0, + "step": 14522 + }, + { + "epoch": 2.6969359331476324, + "grad_norm": 1.6528923511505127, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8917470574378967, + "num_tokens": 529272476.0, + "step": 14523 + }, + { + "epoch": 2.697121634168988, + "grad_norm": 1.6127209663391113, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8899174928665161, + "num_tokens": 529307124.0, + "step": 14524 + }, + { + "epoch": 2.6973073351903434, + "grad_norm": 1.5297718048095703, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8897829055786133, + "num_tokens": 529346748.0, + "step": 14525 + }, + { + "epoch": 2.697493036211699, + "grad_norm": 1.6908460855484009, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8914564847946167, + "num_tokens": 529379597.0, + "step": 14526 + }, + { + "epoch": 2.697678737233055, + "grad_norm": 1.488890528678894, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8908108472824097, + "num_tokens": 529416572.0, + "step": 14527 + }, + { + "epoch": 2.6978644382544106, + "grad_norm": 1.553341269493103, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8845838308334351, + "num_tokens": 529457178.0, + "step": 14528 + }, + { + "epoch": 2.698050139275766, + "grad_norm": 1.6893799304962158, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8822804689407349, + "num_tokens": 529489539.0, + "step": 14529 + }, + { + "epoch": 2.6982358402971216, + "grad_norm": 1.659201741218567, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8834491968154907, + "num_tokens": 529523707.0, + "step": 14530 + }, + { + "epoch": 2.6984215413184773, + "grad_norm": 1.9037683010101318, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8893229961395264, + "num_tokens": 529549980.0, + "step": 14531 + }, + { + "epoch": 2.6986072423398326, + "grad_norm": 1.587315559387207, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8886470198631287, + "num_tokens": 529588232.0, + "step": 14532 + }, + { + "epoch": 2.6987929433611884, + "grad_norm": 1.8436864614486694, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8844590187072754, + "num_tokens": 529624872.0, + "step": 14533 + }, + { + "epoch": 2.698978644382544, + "grad_norm": 1.6080151796340942, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8650642037391663, + "num_tokens": 529665891.0, + "step": 14534 + }, + { + "epoch": 2.6991643454039, + "grad_norm": 1.6422101259231567, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8900366425514221, + "num_tokens": 529699957.0, + "step": 14535 + }, + { + "epoch": 2.6993500464252556, + "grad_norm": 1.5794066190719604, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8840001821517944, + "num_tokens": 529735901.0, + "step": 14536 + }, + { + "epoch": 2.699535747446611, + "grad_norm": 1.5914822816848755, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8800479769706726, + "num_tokens": 529773203.0, + "step": 14537 + }, + { + "epoch": 2.6997214484679666, + "grad_norm": 1.5991263389587402, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8747104406356812, + "num_tokens": 529811116.0, + "step": 14538 + }, + { + "epoch": 2.6999071494893223, + "grad_norm": 1.636596918106079, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8836016058921814, + "num_tokens": 529845510.0, + "step": 14539 + }, + { + "epoch": 2.7000928505106776, + "grad_norm": 1.6200687885284424, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8931819200515747, + "num_tokens": 529882048.0, + "step": 14540 + }, + { + "epoch": 2.7002785515320333, + "grad_norm": 1.4629192352294922, + "learning_rate": 1e-06, + "loss": 0.2711, + "mean_token_accuracy": 0.9025287628173828, + "num_tokens": 529924194.0, + "step": 14541 + }, + { + "epoch": 2.700464252553389, + "grad_norm": 1.621152639389038, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8916653990745544, + "num_tokens": 529957170.0, + "step": 14542 + }, + { + "epoch": 2.700649953574745, + "grad_norm": 1.60764741897583, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8854904174804688, + "num_tokens": 529995352.0, + "step": 14543 + }, + { + "epoch": 2.7008356545961005, + "grad_norm": 1.4376307725906372, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8823678493499756, + "num_tokens": 530040207.0, + "step": 14544 + }, + { + "epoch": 2.701021355617456, + "grad_norm": 1.633529782295227, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8826521635055542, + "num_tokens": 530075619.0, + "step": 14545 + }, + { + "epoch": 2.7012070566388116, + "grad_norm": 1.8093533515930176, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8790313005447388, + "num_tokens": 530106315.0, + "step": 14546 + }, + { + "epoch": 2.7013927576601673, + "grad_norm": 1.6643983125686646, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8848035931587219, + "num_tokens": 530142263.0, + "step": 14547 + }, + { + "epoch": 2.7015784586815226, + "grad_norm": 1.523816704750061, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8803357481956482, + "num_tokens": 530182123.0, + "step": 14548 + }, + { + "epoch": 2.7017641597028783, + "grad_norm": 1.6580736637115479, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8949711322784424, + "num_tokens": 530214604.0, + "step": 14549 + }, + { + "epoch": 2.701949860724234, + "grad_norm": 1.6265794038772583, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8864522576332092, + "num_tokens": 530250348.0, + "step": 14550 + }, + { + "epoch": 2.7021355617455898, + "grad_norm": 1.5303266048431396, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8894037008285522, + "num_tokens": 530287917.0, + "step": 14551 + }, + { + "epoch": 2.702321262766945, + "grad_norm": 1.557673692703247, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8724291920661926, + "num_tokens": 530330119.0, + "step": 14552 + }, + { + "epoch": 2.702506963788301, + "grad_norm": 1.5103416442871094, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8899452686309814, + "num_tokens": 530368754.0, + "step": 14553 + }, + { + "epoch": 2.7026926648096565, + "grad_norm": 1.6522018909454346, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8761126399040222, + "num_tokens": 530402262.0, + "step": 14554 + }, + { + "epoch": 2.702878365831012, + "grad_norm": 1.8320647478103638, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8691104054450989, + "num_tokens": 530434409.0, + "step": 14555 + }, + { + "epoch": 2.7030640668523676, + "grad_norm": 1.879523515701294, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8828943967819214, + "num_tokens": 530464058.0, + "step": 14556 + }, + { + "epoch": 2.7032497678737233, + "grad_norm": 1.6406645774841309, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8871549367904663, + "num_tokens": 530502656.0, + "step": 14557 + }, + { + "epoch": 2.703435468895079, + "grad_norm": 1.7310014963150024, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8851909637451172, + "num_tokens": 530533946.0, + "step": 14558 + }, + { + "epoch": 2.7036211699164348, + "grad_norm": 1.7534065246582031, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8828790783882141, + "num_tokens": 530565305.0, + "step": 14559 + }, + { + "epoch": 2.70380687093779, + "grad_norm": 1.6369404792785645, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8865598440170288, + "num_tokens": 530603233.0, + "step": 14560 + }, + { + "epoch": 2.7039925719591458, + "grad_norm": 1.4197567701339722, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8854242563247681, + "num_tokens": 530649395.0, + "step": 14561 + }, + { + "epoch": 2.7041782729805015, + "grad_norm": 1.6953071355819702, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8823909759521484, + "num_tokens": 530681577.0, + "step": 14562 + }, + { + "epoch": 2.704363974001857, + "grad_norm": 1.627341628074646, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8843035697937012, + "num_tokens": 530719286.0, + "step": 14563 + }, + { + "epoch": 2.7045496750232125, + "grad_norm": 1.7146614789962769, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8898013830184937, + "num_tokens": 530751949.0, + "step": 14564 + }, + { + "epoch": 2.7047353760445683, + "grad_norm": 1.5898323059082031, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8860156536102295, + "num_tokens": 530787735.0, + "step": 14565 + }, + { + "epoch": 2.704921077065924, + "grad_norm": 1.6280208826065063, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8876707553863525, + "num_tokens": 530825059.0, + "step": 14566 + }, + { + "epoch": 2.7051067780872797, + "grad_norm": 1.563278317451477, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8930120468139648, + "num_tokens": 530862235.0, + "step": 14567 + }, + { + "epoch": 2.705292479108635, + "grad_norm": 1.5121487379074097, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8878005743026733, + "num_tokens": 530898943.0, + "step": 14568 + }, + { + "epoch": 2.7054781801299908, + "grad_norm": 1.5146770477294922, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.892107367515564, + "num_tokens": 530938461.0, + "step": 14569 + }, + { + "epoch": 2.7056638811513465, + "grad_norm": 1.4949584007263184, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8941681385040283, + "num_tokens": 530977879.0, + "step": 14570 + }, + { + "epoch": 2.7058495821727018, + "grad_norm": 1.5446776151657104, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8811650276184082, + "num_tokens": 531016210.0, + "step": 14571 + }, + { + "epoch": 2.7060352831940575, + "grad_norm": 1.6668602228164673, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8755214810371399, + "num_tokens": 531052594.0, + "step": 14572 + }, + { + "epoch": 2.7062209842154132, + "grad_norm": 1.679673433303833, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8696408271789551, + "num_tokens": 531088795.0, + "step": 14573 + }, + { + "epoch": 2.706406685236769, + "grad_norm": 1.4851531982421875, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8888002634048462, + "num_tokens": 531125929.0, + "step": 14574 + }, + { + "epoch": 2.7065923862581243, + "grad_norm": 1.75420081615448, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8893636465072632, + "num_tokens": 531155790.0, + "step": 14575 + }, + { + "epoch": 2.70677808727948, + "grad_norm": 1.5014828443527222, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.8984395861625671, + "num_tokens": 531194382.0, + "step": 14576 + }, + { + "epoch": 2.7069637883008357, + "grad_norm": 1.6931865215301514, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.89613276720047, + "num_tokens": 531226228.0, + "step": 14577 + }, + { + "epoch": 2.707149489322191, + "grad_norm": 1.6335926055908203, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8867548108100891, + "num_tokens": 531263384.0, + "step": 14578 + }, + { + "epoch": 2.7073351903435467, + "grad_norm": 1.4983210563659668, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8916753530502319, + "num_tokens": 531304283.0, + "step": 14579 + }, + { + "epoch": 2.7075208913649025, + "grad_norm": 1.5783976316452026, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.8985762000083923, + "num_tokens": 531337772.0, + "step": 14580 + }, + { + "epoch": 2.707706592386258, + "grad_norm": 1.6334319114685059, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.871523916721344, + "num_tokens": 531378371.0, + "step": 14581 + }, + { + "epoch": 2.707892293407614, + "grad_norm": 1.6940556764602661, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8830496668815613, + "num_tokens": 531413668.0, + "step": 14582 + }, + { + "epoch": 2.7080779944289692, + "grad_norm": 1.810644268989563, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8724850416183472, + "num_tokens": 531447901.0, + "step": 14583 + }, + { + "epoch": 2.708263695450325, + "grad_norm": 1.683384895324707, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8825104832649231, + "num_tokens": 531485797.0, + "step": 14584 + }, + { + "epoch": 2.7084493964716807, + "grad_norm": 1.5898319482803345, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8754633665084839, + "num_tokens": 531525292.0, + "step": 14585 + }, + { + "epoch": 2.708635097493036, + "grad_norm": 1.673546552658081, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8954603672027588, + "num_tokens": 531556159.0, + "step": 14586 + }, + { + "epoch": 2.7088207985143917, + "grad_norm": 1.4948885440826416, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8938531279563904, + "num_tokens": 531594343.0, + "step": 14587 + }, + { + "epoch": 2.7090064995357475, + "grad_norm": 1.5872658491134644, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8906921148300171, + "num_tokens": 531629908.0, + "step": 14588 + }, + { + "epoch": 2.709192200557103, + "grad_norm": 1.5982609987258911, + "learning_rate": 1e-06, + "loss": 0.2808, + "mean_token_accuracy": 0.898756742477417, + "num_tokens": 531662853.0, + "step": 14589 + }, + { + "epoch": 2.709377901578459, + "grad_norm": 1.5670207738876343, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8894076943397522, + "num_tokens": 531704633.0, + "step": 14590 + }, + { + "epoch": 2.709563602599814, + "grad_norm": 1.5527496337890625, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8883923888206482, + "num_tokens": 531739274.0, + "step": 14591 + }, + { + "epoch": 2.70974930362117, + "grad_norm": 1.5857456922531128, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8811241388320923, + "num_tokens": 531776715.0, + "step": 14592 + }, + { + "epoch": 2.7099350046425257, + "grad_norm": 1.5162674188613892, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8900202512741089, + "num_tokens": 531814207.0, + "step": 14593 + }, + { + "epoch": 2.710120705663881, + "grad_norm": 1.5478662252426147, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8857886791229248, + "num_tokens": 531855628.0, + "step": 14594 + }, + { + "epoch": 2.7103064066852367, + "grad_norm": 1.7119568586349487, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8966567516326904, + "num_tokens": 531890127.0, + "step": 14595 + }, + { + "epoch": 2.7104921077065924, + "grad_norm": 1.5116887092590332, + "learning_rate": 1e-06, + "loss": 0.2805, + "mean_token_accuracy": 0.8983255624771118, + "num_tokens": 531928161.0, + "step": 14596 + }, + { + "epoch": 2.710677808727948, + "grad_norm": 1.4964734315872192, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8868196606636047, + "num_tokens": 531969651.0, + "step": 14597 + }, + { + "epoch": 2.7108635097493035, + "grad_norm": 1.7790371179580688, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8778078556060791, + "num_tokens": 532000177.0, + "step": 14598 + }, + { + "epoch": 2.711049210770659, + "grad_norm": 1.4729669094085693, + "learning_rate": 1e-06, + "loss": 0.2718, + "mean_token_accuracy": 0.9032612442970276, + "num_tokens": 532038401.0, + "step": 14599 + }, + { + "epoch": 2.711234911792015, + "grad_norm": 1.7608177661895752, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8741373419761658, + "num_tokens": 532075156.0, + "step": 14600 + }, + { + "epoch": 2.71142061281337, + "grad_norm": 1.7529191970825195, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8837541341781616, + "num_tokens": 532108923.0, + "step": 14601 + }, + { + "epoch": 2.711606313834726, + "grad_norm": 1.6838473081588745, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8861119151115417, + "num_tokens": 532145017.0, + "step": 14602 + }, + { + "epoch": 2.7117920148560817, + "grad_norm": 1.6492162942886353, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8930460214614868, + "num_tokens": 532181871.0, + "step": 14603 + }, + { + "epoch": 2.7119777158774374, + "grad_norm": 1.6648333072662354, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8932830691337585, + "num_tokens": 532214385.0, + "step": 14604 + }, + { + "epoch": 2.712163416898793, + "grad_norm": 1.5784329175949097, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.8998057842254639, + "num_tokens": 532251736.0, + "step": 14605 + }, + { + "epoch": 2.7123491179201484, + "grad_norm": 1.750592827796936, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8772333860397339, + "num_tokens": 532282494.0, + "step": 14606 + }, + { + "epoch": 2.712534818941504, + "grad_norm": 1.669836401939392, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8823952674865723, + "num_tokens": 532314820.0, + "step": 14607 + }, + { + "epoch": 2.71272051996286, + "grad_norm": 1.6351573467254639, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8874114155769348, + "num_tokens": 532349326.0, + "step": 14608 + }, + { + "epoch": 2.712906220984215, + "grad_norm": 1.5985839366912842, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8803256750106812, + "num_tokens": 532389150.0, + "step": 14609 + }, + { + "epoch": 2.713091922005571, + "grad_norm": 1.5916706323623657, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8870531320571899, + "num_tokens": 532424353.0, + "step": 14610 + }, + { + "epoch": 2.7132776230269267, + "grad_norm": 1.730828046798706, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8805939555168152, + "num_tokens": 532457189.0, + "step": 14611 + }, + { + "epoch": 2.7134633240482824, + "grad_norm": 1.6295466423034668, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8896462917327881, + "num_tokens": 532493877.0, + "step": 14612 + }, + { + "epoch": 2.713649025069638, + "grad_norm": 1.9303830862045288, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8665879964828491, + "num_tokens": 532521102.0, + "step": 14613 + }, + { + "epoch": 2.7138347260909934, + "grad_norm": 1.5836950540542603, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8813842535018921, + "num_tokens": 532557248.0, + "step": 14614 + }, + { + "epoch": 2.714020427112349, + "grad_norm": 1.6706002950668335, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8805919885635376, + "num_tokens": 532591719.0, + "step": 14615 + }, + { + "epoch": 2.714206128133705, + "grad_norm": 1.5244054794311523, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8885716795921326, + "num_tokens": 532630842.0, + "step": 14616 + }, + { + "epoch": 2.71439182915506, + "grad_norm": 1.5861403942108154, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.884163498878479, + "num_tokens": 532667874.0, + "step": 14617 + }, + { + "epoch": 2.714577530176416, + "grad_norm": 1.667574167251587, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8880129456520081, + "num_tokens": 532703847.0, + "step": 14618 + }, + { + "epoch": 2.7147632311977716, + "grad_norm": 1.6603918075561523, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8893251419067383, + "num_tokens": 532740287.0, + "step": 14619 + }, + { + "epoch": 2.7149489322191274, + "grad_norm": 1.5560028553009033, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8939640522003174, + "num_tokens": 532780911.0, + "step": 14620 + }, + { + "epoch": 2.715134633240483, + "grad_norm": 1.786414384841919, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8653711080551147, + "num_tokens": 532814562.0, + "step": 14621 + }, + { + "epoch": 2.7153203342618384, + "grad_norm": 1.6399612426757812, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.8942736387252808, + "num_tokens": 532849073.0, + "step": 14622 + }, + { + "epoch": 2.715506035283194, + "grad_norm": 1.615475058555603, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8935624957084656, + "num_tokens": 532880984.0, + "step": 14623 + }, + { + "epoch": 2.7156917363045494, + "grad_norm": 1.4521818161010742, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.885517418384552, + "num_tokens": 532923357.0, + "step": 14624 + }, + { + "epoch": 2.715877437325905, + "grad_norm": 1.5682131052017212, + "learning_rate": 1e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.8979116678237915, + "num_tokens": 532957910.0, + "step": 14625 + }, + { + "epoch": 2.716063138347261, + "grad_norm": 1.5938823223114014, + "learning_rate": 1e-06, + "loss": 0.2767, + "mean_token_accuracy": 0.899190366268158, + "num_tokens": 532995335.0, + "step": 14626 + }, + { + "epoch": 2.7162488393686166, + "grad_norm": 1.7522356510162354, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8881833553314209, + "num_tokens": 533028880.0, + "step": 14627 + }, + { + "epoch": 2.7164345403899723, + "grad_norm": 1.6290037631988525, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8771000504493713, + "num_tokens": 533070272.0, + "step": 14628 + }, + { + "epoch": 2.7166202414113276, + "grad_norm": 1.6478345394134521, + "learning_rate": 1e-06, + "loss": 0.265, + "mean_token_accuracy": 0.9008989334106445, + "num_tokens": 533104195.0, + "step": 14629 + }, + { + "epoch": 2.7168059424326834, + "grad_norm": 1.5081592798233032, + "learning_rate": 1e-06, + "loss": 0.2699, + "mean_token_accuracy": 0.903128981590271, + "num_tokens": 533145367.0, + "step": 14630 + }, + { + "epoch": 2.716991643454039, + "grad_norm": 1.6131806373596191, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8897380828857422, + "num_tokens": 533178652.0, + "step": 14631 + }, + { + "epoch": 2.7171773444753944, + "grad_norm": 1.5958428382873535, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8723753690719604, + "num_tokens": 533219041.0, + "step": 14632 + }, + { + "epoch": 2.71736304549675, + "grad_norm": 1.6931253671646118, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8860970735549927, + "num_tokens": 533254488.0, + "step": 14633 + }, + { + "epoch": 2.717548746518106, + "grad_norm": 1.6228787899017334, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.88761305809021, + "num_tokens": 533292889.0, + "step": 14634 + }, + { + "epoch": 2.7177344475394616, + "grad_norm": 1.67913019657135, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8882890939712524, + "num_tokens": 533327675.0, + "step": 14635 + }, + { + "epoch": 2.7179201485608173, + "grad_norm": 1.6666563749313354, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8991343975067139, + "num_tokens": 533358625.0, + "step": 14636 + }, + { + "epoch": 2.7181058495821726, + "grad_norm": 1.6905579566955566, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8766030669212341, + "num_tokens": 533392315.0, + "step": 14637 + }, + { + "epoch": 2.7182915506035283, + "grad_norm": 1.5952541828155518, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8945139646530151, + "num_tokens": 533428644.0, + "step": 14638 + }, + { + "epoch": 2.718477251624884, + "grad_norm": 1.4790483713150024, + "learning_rate": 1e-06, + "loss": 0.2544, + "mean_token_accuracy": 0.9074447154998779, + "num_tokens": 533465245.0, + "step": 14639 + }, + { + "epoch": 2.7186629526462394, + "grad_norm": 1.6225022077560425, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8958785533905029, + "num_tokens": 533498391.0, + "step": 14640 + }, + { + "epoch": 2.718848653667595, + "grad_norm": 1.6514142751693726, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8878750801086426, + "num_tokens": 533530578.0, + "step": 14641 + }, + { + "epoch": 2.719034354688951, + "grad_norm": 1.8182041645050049, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8916465640068054, + "num_tokens": 533560004.0, + "step": 14642 + }, + { + "epoch": 2.7192200557103066, + "grad_norm": 1.8747738599777222, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8895193338394165, + "num_tokens": 533586236.0, + "step": 14643 + }, + { + "epoch": 2.7194057567316623, + "grad_norm": 1.6330251693725586, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.890207827091217, + "num_tokens": 533623299.0, + "step": 14644 + }, + { + "epoch": 2.7195914577530176, + "grad_norm": 1.6651345491409302, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8862056136131287, + "num_tokens": 533658067.0, + "step": 14645 + }, + { + "epoch": 2.7197771587743733, + "grad_norm": 1.5298035144805908, + "learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.8976348638534546, + "num_tokens": 533694784.0, + "step": 14646 + }, + { + "epoch": 2.7199628597957286, + "grad_norm": 1.6732724905014038, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.882203221321106, + "num_tokens": 533733910.0, + "step": 14647 + }, + { + "epoch": 2.7201485608170843, + "grad_norm": 1.7872607707977295, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8816831707954407, + "num_tokens": 533768012.0, + "step": 14648 + }, + { + "epoch": 2.72033426183844, + "grad_norm": 1.5978593826293945, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8815426826477051, + "num_tokens": 533805781.0, + "step": 14649 + }, + { + "epoch": 2.720519962859796, + "grad_norm": 1.72994065284729, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.886461615562439, + "num_tokens": 533837125.0, + "step": 14650 + }, + { + "epoch": 2.7207056638811515, + "grad_norm": 1.629668116569519, + "learning_rate": 1e-06, + "loss": 0.2815, + "mean_token_accuracy": 0.8988218307495117, + "num_tokens": 533869220.0, + "step": 14651 + }, + { + "epoch": 2.720891364902507, + "grad_norm": 1.5966532230377197, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8922039270401001, + "num_tokens": 533905533.0, + "step": 14652 + }, + { + "epoch": 2.7210770659238626, + "grad_norm": 1.6585484743118286, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8932986259460449, + "num_tokens": 533938778.0, + "step": 14653 + }, + { + "epoch": 2.7212627669452183, + "grad_norm": 1.5176604986190796, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8911924958229065, + "num_tokens": 533975613.0, + "step": 14654 + }, + { + "epoch": 2.7214484679665736, + "grad_norm": 1.7025911808013916, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8803833723068237, + "num_tokens": 534013608.0, + "step": 14655 + }, + { + "epoch": 2.7216341689879293, + "grad_norm": 1.5823427438735962, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8921597003936768, + "num_tokens": 534050387.0, + "step": 14656 + }, + { + "epoch": 2.721819870009285, + "grad_norm": 1.54596745967865, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.877968430519104, + "num_tokens": 534089414.0, + "step": 14657 + }, + { + "epoch": 2.722005571030641, + "grad_norm": 1.6329526901245117, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.8985905647277832, + "num_tokens": 534121405.0, + "step": 14658 + }, + { + "epoch": 2.7221912720519965, + "grad_norm": 1.6695696115493774, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.8991013765335083, + "num_tokens": 534153161.0, + "step": 14659 + }, + { + "epoch": 2.722376973073352, + "grad_norm": 1.649458408355713, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8875983357429504, + "num_tokens": 534189355.0, + "step": 14660 + }, + { + "epoch": 2.7225626740947075, + "grad_norm": 1.62994384765625, + "learning_rate": 1e-06, + "loss": 0.2847, + "mean_token_accuracy": 0.8970446586608887, + "num_tokens": 534224396.0, + "step": 14661 + }, + { + "epoch": 2.7227483751160633, + "grad_norm": 1.6520154476165771, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8830127120018005, + "num_tokens": 534261166.0, + "step": 14662 + }, + { + "epoch": 2.7229340761374186, + "grad_norm": 1.628736138343811, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8820552825927734, + "num_tokens": 534296804.0, + "step": 14663 + }, + { + "epoch": 2.7231197771587743, + "grad_norm": 1.5274182558059692, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8842340111732483, + "num_tokens": 534334545.0, + "step": 14664 + }, + { + "epoch": 2.72330547818013, + "grad_norm": 1.7354718446731567, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8851375579833984, + "num_tokens": 534367288.0, + "step": 14665 + }, + { + "epoch": 2.7234911792014858, + "grad_norm": 1.5375406742095947, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8845298886299133, + "num_tokens": 534408959.0, + "step": 14666 + }, + { + "epoch": 2.7236768802228415, + "grad_norm": 1.5465288162231445, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8940653800964355, + "num_tokens": 534448889.0, + "step": 14667 + }, + { + "epoch": 2.723862581244197, + "grad_norm": 1.646761178970337, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8861196637153625, + "num_tokens": 534483003.0, + "step": 14668 + }, + { + "epoch": 2.7240482822655525, + "grad_norm": 1.710587739944458, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8815606832504272, + "num_tokens": 534517511.0, + "step": 14669 + }, + { + "epoch": 2.724233983286908, + "grad_norm": 1.4406126737594604, + "learning_rate": 1e-06, + "loss": 0.2529, + "mean_token_accuracy": 0.9063620567321777, + "num_tokens": 534555262.0, + "step": 14670 + }, + { + "epoch": 2.7244196843082635, + "grad_norm": 1.7876644134521484, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8850216865539551, + "num_tokens": 534585772.0, + "step": 14671 + }, + { + "epoch": 2.7246053853296193, + "grad_norm": 1.5923259258270264, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.883711576461792, + "num_tokens": 534623337.0, + "step": 14672 + }, + { + "epoch": 2.724791086350975, + "grad_norm": 1.5610461235046387, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8927216529846191, + "num_tokens": 534659989.0, + "step": 14673 + }, + { + "epoch": 2.7249767873723307, + "grad_norm": 1.6402249336242676, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8947191834449768, + "num_tokens": 534695684.0, + "step": 14674 + }, + { + "epoch": 2.725162488393686, + "grad_norm": 1.54056715965271, + "learning_rate": 1e-06, + "loss": 0.2538, + "mean_token_accuracy": 0.9085108637809753, + "num_tokens": 534730763.0, + "step": 14675 + }, + { + "epoch": 2.7253481894150418, + "grad_norm": 1.6320009231567383, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8971989154815674, + "num_tokens": 534766011.0, + "step": 14676 + }, + { + "epoch": 2.7255338904363975, + "grad_norm": 1.6365997791290283, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8780351281166077, + "num_tokens": 534802433.0, + "step": 14677 + }, + { + "epoch": 2.7257195914577528, + "grad_norm": 1.6956512928009033, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8881285786628723, + "num_tokens": 534839933.0, + "step": 14678 + }, + { + "epoch": 2.7259052924791085, + "grad_norm": 1.7420623302459717, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8773088455200195, + "num_tokens": 534877532.0, + "step": 14679 + }, + { + "epoch": 2.7260909935004642, + "grad_norm": 1.460220217704773, + "learning_rate": 1e-06, + "loss": 0.27, + "mean_token_accuracy": 0.9002500772476196, + "num_tokens": 534916890.0, + "step": 14680 + }, + { + "epoch": 2.72627669452182, + "grad_norm": 1.5621503591537476, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8919049501419067, + "num_tokens": 534954937.0, + "step": 14681 + }, + { + "epoch": 2.7264623955431757, + "grad_norm": 1.6041244268417358, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8811089396476746, + "num_tokens": 534993906.0, + "step": 14682 + }, + { + "epoch": 2.726648096564531, + "grad_norm": 1.5910661220550537, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8878999948501587, + "num_tokens": 535033781.0, + "step": 14683 + }, + { + "epoch": 2.7268337975858867, + "grad_norm": 1.6451114416122437, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8753887414932251, + "num_tokens": 535074995.0, + "step": 14684 + }, + { + "epoch": 2.7270194986072425, + "grad_norm": 1.6825402975082397, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8896569609642029, + "num_tokens": 535108466.0, + "step": 14685 + }, + { + "epoch": 2.7272051996285978, + "grad_norm": 1.585161805152893, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8826670050621033, + "num_tokens": 535149970.0, + "step": 14686 + }, + { + "epoch": 2.7273909006499535, + "grad_norm": 1.5622735023498535, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8873409032821655, + "num_tokens": 535189290.0, + "step": 14687 + }, + { + "epoch": 2.727576601671309, + "grad_norm": 1.595117449760437, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8804932832717896, + "num_tokens": 535229622.0, + "step": 14688 + }, + { + "epoch": 2.727762302692665, + "grad_norm": 1.6351168155670166, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8735044002532959, + "num_tokens": 535270091.0, + "step": 14689 + }, + { + "epoch": 2.7279480037140207, + "grad_norm": 1.653732419013977, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8805975317955017, + "num_tokens": 535306753.0, + "step": 14690 + }, + { + "epoch": 2.728133704735376, + "grad_norm": 1.679807424545288, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8890791535377502, + "num_tokens": 535341993.0, + "step": 14691 + }, + { + "epoch": 2.7283194057567317, + "grad_norm": 1.7408493757247925, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8804395198822021, + "num_tokens": 535376801.0, + "step": 14692 + }, + { + "epoch": 2.7285051067780874, + "grad_norm": 1.8511680364608765, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8860284090042114, + "num_tokens": 535407268.0, + "step": 14693 + }, + { + "epoch": 2.7286908077994427, + "grad_norm": 1.6918398141860962, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.887606680393219, + "num_tokens": 535444276.0, + "step": 14694 + }, + { + "epoch": 2.7288765088207985, + "grad_norm": 1.6475313901901245, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8891287446022034, + "num_tokens": 535484131.0, + "step": 14695 + }, + { + "epoch": 2.729062209842154, + "grad_norm": 1.4181941747665405, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8822454810142517, + "num_tokens": 535528363.0, + "step": 14696 + }, + { + "epoch": 2.72924791086351, + "grad_norm": 1.5706486701965332, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8912562131881714, + "num_tokens": 535564293.0, + "step": 14697 + }, + { + "epoch": 2.729433611884865, + "grad_norm": 1.527431845664978, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8951079845428467, + "num_tokens": 535603671.0, + "step": 14698 + }, + { + "epoch": 2.729619312906221, + "grad_norm": 1.6064766645431519, + "learning_rate": 1e-06, + "loss": 0.2809, + "mean_token_accuracy": 0.8988691568374634, + "num_tokens": 535640202.0, + "step": 14699 + }, + { + "epoch": 2.7298050139275767, + "grad_norm": 1.614236831665039, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.8967152237892151, + "num_tokens": 535676148.0, + "step": 14700 + }, + { + "epoch": 2.729990714948932, + "grad_norm": 1.7502005100250244, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8851341605186462, + "num_tokens": 535710884.0, + "step": 14701 + }, + { + "epoch": 2.7301764159702877, + "grad_norm": 1.6289973258972168, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8815652132034302, + "num_tokens": 535746830.0, + "step": 14702 + }, + { + "epoch": 2.7303621169916434, + "grad_norm": 1.7657592296600342, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8784890174865723, + "num_tokens": 535778700.0, + "step": 14703 + }, + { + "epoch": 2.730547818012999, + "grad_norm": 1.3947702646255493, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8978605270385742, + "num_tokens": 535823082.0, + "step": 14704 + }, + { + "epoch": 2.730733519034355, + "grad_norm": 1.67774498462677, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8894168138504028, + "num_tokens": 535856751.0, + "step": 14705 + }, + { + "epoch": 2.73091922005571, + "grad_norm": 1.6970975399017334, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8740319609642029, + "num_tokens": 535892368.0, + "step": 14706 + }, + { + "epoch": 2.731104921077066, + "grad_norm": 1.6319745779037476, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.8961920738220215, + "num_tokens": 535927166.0, + "step": 14707 + }, + { + "epoch": 2.7312906220984217, + "grad_norm": 1.6227165460586548, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8900865316390991, + "num_tokens": 535961172.0, + "step": 14708 + }, + { + "epoch": 2.731476323119777, + "grad_norm": 1.6859334707260132, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8693184852600098, + "num_tokens": 536001114.0, + "step": 14709 + }, + { + "epoch": 2.7316620241411327, + "grad_norm": 1.5622304677963257, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.881916880607605, + "num_tokens": 536039008.0, + "step": 14710 + }, + { + "epoch": 2.7318477251624884, + "grad_norm": 1.6403539180755615, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8742676973342896, + "num_tokens": 536076197.0, + "step": 14711 + }, + { + "epoch": 2.732033426183844, + "grad_norm": 1.7387053966522217, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8781173825263977, + "num_tokens": 536112790.0, + "step": 14712 + }, + { + "epoch": 2.7322191272052, + "grad_norm": 2.045179605484009, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8828085064888, + "num_tokens": 536152455.0, + "step": 14713 + }, + { + "epoch": 2.732404828226555, + "grad_norm": 1.6265252828598022, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8929714560508728, + "num_tokens": 536186931.0, + "step": 14714 + }, + { + "epoch": 2.732590529247911, + "grad_norm": 1.733958125114441, + "learning_rate": 1e-06, + "loss": 0.2713, + "mean_token_accuracy": 0.901594340801239, + "num_tokens": 536214951.0, + "step": 14715 + }, + { + "epoch": 2.7327762302692666, + "grad_norm": 1.746780276298523, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8744953870773315, + "num_tokens": 536249405.0, + "step": 14716 + }, + { + "epoch": 2.732961931290622, + "grad_norm": 1.6029977798461914, + "learning_rate": 1e-06, + "loss": 0.2379, + "mean_token_accuracy": 0.9100043773651123, + "num_tokens": 536279274.0, + "step": 14717 + }, + { + "epoch": 2.7331476323119777, + "grad_norm": 1.70183265209198, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8814641237258911, + "num_tokens": 536313604.0, + "step": 14718 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 1.5853304862976074, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8804929852485657, + "num_tokens": 536353769.0, + "step": 14719 + }, + { + "epoch": 2.733519034354689, + "grad_norm": 1.5312387943267822, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8849091529846191, + "num_tokens": 536392892.0, + "step": 14720 + }, + { + "epoch": 2.7337047353760444, + "grad_norm": 1.591511607170105, + "learning_rate": 1e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.899677038192749, + "num_tokens": 536432077.0, + "step": 14721 + }, + { + "epoch": 2.7338904363974, + "grad_norm": 1.4989395141601562, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8759450912475586, + "num_tokens": 536476267.0, + "step": 14722 + }, + { + "epoch": 2.734076137418756, + "grad_norm": 1.5011441707611084, + "learning_rate": 1e-06, + "loss": 0.2641, + "mean_token_accuracy": 0.9055557250976562, + "num_tokens": 536514865.0, + "step": 14723 + }, + { + "epoch": 2.734261838440111, + "grad_norm": 1.6374804973602295, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8874820470809937, + "num_tokens": 536549410.0, + "step": 14724 + }, + { + "epoch": 2.734447539461467, + "grad_norm": 1.6875317096710205, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8814055919647217, + "num_tokens": 536585507.0, + "step": 14725 + }, + { + "epoch": 2.7346332404828226, + "grad_norm": 1.6889005899429321, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8785206079483032, + "num_tokens": 536619117.0, + "step": 14726 + }, + { + "epoch": 2.7348189415041784, + "grad_norm": 1.5273032188415527, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8852220773696899, + "num_tokens": 536658012.0, + "step": 14727 + }, + { + "epoch": 2.735004642525534, + "grad_norm": 1.6231812238693237, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8815004825592041, + "num_tokens": 536695699.0, + "step": 14728 + }, + { + "epoch": 2.7351903435468894, + "grad_norm": 1.5292096138000488, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8943527936935425, + "num_tokens": 536730715.0, + "step": 14729 + }, + { + "epoch": 2.735376044568245, + "grad_norm": 1.8024541139602661, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8812845349311829, + "num_tokens": 536763822.0, + "step": 14730 + }, + { + "epoch": 2.735561745589601, + "grad_norm": 1.6207810640335083, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8840917348861694, + "num_tokens": 536800926.0, + "step": 14731 + }, + { + "epoch": 2.735747446610956, + "grad_norm": 1.55442476272583, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.889945387840271, + "num_tokens": 536836241.0, + "step": 14732 + }, + { + "epoch": 2.735933147632312, + "grad_norm": 1.5775560140609741, + "learning_rate": 1e-06, + "loss": 0.2757, + "mean_token_accuracy": 0.8994078636169434, + "num_tokens": 536871314.0, + "step": 14733 + }, + { + "epoch": 2.7361188486536676, + "grad_norm": 1.6544878482818604, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.884097158908844, + "num_tokens": 536905315.0, + "step": 14734 + }, + { + "epoch": 2.7363045496750233, + "grad_norm": 1.584123134613037, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.8960649967193604, + "num_tokens": 536939976.0, + "step": 14735 + }, + { + "epoch": 2.736490250696379, + "grad_norm": 1.5899912118911743, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8898845911026001, + "num_tokens": 536976620.0, + "step": 14736 + }, + { + "epoch": 2.7366759517177344, + "grad_norm": 1.6457279920578003, + "learning_rate": 1e-06, + "loss": 0.2771, + "mean_token_accuracy": 0.8990920782089233, + "num_tokens": 537007765.0, + "step": 14737 + }, + { + "epoch": 2.73686165273909, + "grad_norm": 1.5319050550460815, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8815547823905945, + "num_tokens": 537050067.0, + "step": 14738 + }, + { + "epoch": 2.737047353760446, + "grad_norm": 1.5440959930419922, + "learning_rate": 1e-06, + "loss": 0.2645, + "mean_token_accuracy": 0.9029589891433716, + "num_tokens": 537084164.0, + "step": 14739 + }, + { + "epoch": 2.737233054781801, + "grad_norm": 1.6522232294082642, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8841677904129028, + "num_tokens": 537119911.0, + "step": 14740 + }, + { + "epoch": 2.737418755803157, + "grad_norm": 1.6466110944747925, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8788778781890869, + "num_tokens": 537154811.0, + "step": 14741 + }, + { + "epoch": 2.7376044568245126, + "grad_norm": 1.5810439586639404, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8904333710670471, + "num_tokens": 537189338.0, + "step": 14742 + }, + { + "epoch": 2.7377901578458683, + "grad_norm": 1.657904863357544, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8812353014945984, + "num_tokens": 537227275.0, + "step": 14743 + }, + { + "epoch": 2.7379758588672236, + "grad_norm": 1.5796564817428589, + "learning_rate": 1e-06, + "loss": 0.2815, + "mean_token_accuracy": 0.8970725536346436, + "num_tokens": 537261138.0, + "step": 14744 + }, + { + "epoch": 2.7381615598885793, + "grad_norm": 1.4782301187515259, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8886235952377319, + "num_tokens": 537302047.0, + "step": 14745 + }, + { + "epoch": 2.738347260909935, + "grad_norm": 1.6540740728378296, + "learning_rate": 1e-06, + "loss": 0.262, + "mean_token_accuracy": 0.905744194984436, + "num_tokens": 537333208.0, + "step": 14746 + }, + { + "epoch": 2.7385329619312904, + "grad_norm": 1.653653621673584, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.882565975189209, + "num_tokens": 537372761.0, + "step": 14747 + }, + { + "epoch": 2.738718662952646, + "grad_norm": 1.6473013162612915, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8863093852996826, + "num_tokens": 537409886.0, + "step": 14748 + }, + { + "epoch": 2.738904363974002, + "grad_norm": 1.813996434211731, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.876050591468811, + "num_tokens": 537441994.0, + "step": 14749 + }, + { + "epoch": 2.7390900649953576, + "grad_norm": 1.4973862171173096, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8778949975967407, + "num_tokens": 537485768.0, + "step": 14750 + }, + { + "epoch": 2.7392757660167133, + "grad_norm": 1.7043105363845825, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8878528475761414, + "num_tokens": 537517985.0, + "step": 14751 + }, + { + "epoch": 2.7394614670380686, + "grad_norm": 1.6250507831573486, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8889137506484985, + "num_tokens": 537553121.0, + "step": 14752 + }, + { + "epoch": 2.7396471680594243, + "grad_norm": 1.5995213985443115, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8828150033950806, + "num_tokens": 537590018.0, + "step": 14753 + }, + { + "epoch": 2.73983286908078, + "grad_norm": 1.7892184257507324, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.878937840461731, + "num_tokens": 537622622.0, + "step": 14754 + }, + { + "epoch": 2.7400185701021353, + "grad_norm": 2.248387098312378, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8795758485794067, + "num_tokens": 537653986.0, + "step": 14755 + }, + { + "epoch": 2.740204271123491, + "grad_norm": 1.6092664003372192, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8950035572052002, + "num_tokens": 537693483.0, + "step": 14756 + }, + { + "epoch": 2.740389972144847, + "grad_norm": 1.5425852537155151, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.8950325846672058, + "num_tokens": 537732684.0, + "step": 14757 + }, + { + "epoch": 2.7405756731662025, + "grad_norm": 1.5848585367202759, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8882229924201965, + "num_tokens": 537771054.0, + "step": 14758 + }, + { + "epoch": 2.7407613741875583, + "grad_norm": 1.473575234413147, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8842239379882812, + "num_tokens": 537812994.0, + "step": 14759 + }, + { + "epoch": 2.7409470752089136, + "grad_norm": 1.5797663927078247, + "learning_rate": 1e-06, + "loss": 0.2749, + "mean_token_accuracy": 0.9027122259140015, + "num_tokens": 537845381.0, + "step": 14760 + }, + { + "epoch": 2.7411327762302693, + "grad_norm": 1.523212194442749, + "learning_rate": 1e-06, + "loss": 0.2759, + "mean_token_accuracy": 0.8983300924301147, + "num_tokens": 537880931.0, + "step": 14761 + }, + { + "epoch": 2.741318477251625, + "grad_norm": 1.5520823001861572, + "learning_rate": 1e-06, + "loss": 0.269, + "mean_token_accuracy": 0.9016609191894531, + "num_tokens": 537916570.0, + "step": 14762 + }, + { + "epoch": 2.7415041782729803, + "grad_norm": 1.56046462059021, + "learning_rate": 1e-06, + "loss": 0.2712, + "mean_token_accuracy": 0.9012370109558105, + "num_tokens": 537951446.0, + "step": 14763 + }, + { + "epoch": 2.741689879294336, + "grad_norm": 1.5409072637557983, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8871846199035645, + "num_tokens": 537988426.0, + "step": 14764 + }, + { + "epoch": 2.741875580315692, + "grad_norm": 1.6377637386322021, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8932006359100342, + "num_tokens": 538022365.0, + "step": 14765 + }, + { + "epoch": 2.7420612813370475, + "grad_norm": 1.6499996185302734, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8901647329330444, + "num_tokens": 538061184.0, + "step": 14766 + }, + { + "epoch": 2.742246982358403, + "grad_norm": 1.709099292755127, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8947557210922241, + "num_tokens": 538094626.0, + "step": 14767 + }, + { + "epoch": 2.7424326833797585, + "grad_norm": 1.638355016708374, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8820230960845947, + "num_tokens": 538132429.0, + "step": 14768 + }, + { + "epoch": 2.7426183844011143, + "grad_norm": 1.6992138624191284, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8871011137962341, + "num_tokens": 538166498.0, + "step": 14769 + }, + { + "epoch": 2.7428040854224696, + "grad_norm": 1.6219379901885986, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8895086646080017, + "num_tokens": 538204034.0, + "step": 14770 + }, + { + "epoch": 2.7429897864438253, + "grad_norm": 1.6527926921844482, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8943195939064026, + "num_tokens": 538238223.0, + "step": 14771 + }, + { + "epoch": 2.743175487465181, + "grad_norm": 1.8299627304077148, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8905816674232483, + "num_tokens": 538266947.0, + "step": 14772 + }, + { + "epoch": 2.7433611884865368, + "grad_norm": 1.4735337495803833, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.88060462474823, + "num_tokens": 538310738.0, + "step": 14773 + }, + { + "epoch": 2.7435468895078925, + "grad_norm": 1.5969719886779785, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.892301619052887, + "num_tokens": 538346717.0, + "step": 14774 + }, + { + "epoch": 2.743732590529248, + "grad_norm": 1.7056560516357422, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8896960020065308, + "num_tokens": 538381571.0, + "step": 14775 + }, + { + "epoch": 2.7439182915506035, + "grad_norm": 1.620453953742981, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8805157542228699, + "num_tokens": 538419868.0, + "step": 14776 + }, + { + "epoch": 2.7441039925719592, + "grad_norm": 1.6500434875488281, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8763487339019775, + "num_tokens": 538456224.0, + "step": 14777 + }, + { + "epoch": 2.7442896935933145, + "grad_norm": 1.5926810503005981, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8893254995346069, + "num_tokens": 538490910.0, + "step": 14778 + }, + { + "epoch": 2.7444753946146703, + "grad_norm": 1.6322516202926636, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8815270662307739, + "num_tokens": 538528395.0, + "step": 14779 + }, + { + "epoch": 2.744661095636026, + "grad_norm": 1.6098496913909912, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8988856077194214, + "num_tokens": 538560832.0, + "step": 14780 + }, + { + "epoch": 2.7448467966573817, + "grad_norm": 1.580452799797058, + "learning_rate": 1e-06, + "loss": 0.2782, + "mean_token_accuracy": 0.9003204107284546, + "num_tokens": 538594642.0, + "step": 14781 + }, + { + "epoch": 2.7450324976787375, + "grad_norm": 1.6534382104873657, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8772622346878052, + "num_tokens": 538629555.0, + "step": 14782 + }, + { + "epoch": 2.7452181987000928, + "grad_norm": 1.6793673038482666, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8759000897407532, + "num_tokens": 538664376.0, + "step": 14783 + }, + { + "epoch": 2.7454038997214485, + "grad_norm": 1.7247310876846313, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8752243518829346, + "num_tokens": 538701286.0, + "step": 14784 + }, + { + "epoch": 2.7455896007428042, + "grad_norm": 1.7662444114685059, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.889963686466217, + "num_tokens": 538731522.0, + "step": 14785 + }, + { + "epoch": 2.7457753017641595, + "grad_norm": 1.6008880138397217, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8922982215881348, + "num_tokens": 538767871.0, + "step": 14786 + }, + { + "epoch": 2.7459610027855152, + "grad_norm": 1.685645580291748, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8985099792480469, + "num_tokens": 538800060.0, + "step": 14787 + }, + { + "epoch": 2.746146703806871, + "grad_norm": 1.4134817123413086, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.8944905400276184, + "num_tokens": 538840587.0, + "step": 14788 + }, + { + "epoch": 2.7463324048282267, + "grad_norm": 1.7039461135864258, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8794786930084229, + "num_tokens": 538877526.0, + "step": 14789 + }, + { + "epoch": 2.7465181058495824, + "grad_norm": 1.5687310695648193, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8814111948013306, + "num_tokens": 538915804.0, + "step": 14790 + }, + { + "epoch": 2.7467038068709377, + "grad_norm": 1.608323574066162, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8745124340057373, + "num_tokens": 538953264.0, + "step": 14791 + }, + { + "epoch": 2.7468895078922935, + "grad_norm": 1.668757677078247, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8951826691627502, + "num_tokens": 538986369.0, + "step": 14792 + }, + { + "epoch": 2.7470752089136488, + "grad_norm": 1.6365602016448975, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8780580759048462, + "num_tokens": 539021099.0, + "step": 14793 + }, + { + "epoch": 2.7472609099350045, + "grad_norm": 1.7136608362197876, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8758079409599304, + "num_tokens": 539053405.0, + "step": 14794 + }, + { + "epoch": 2.7474466109563602, + "grad_norm": 1.5206183195114136, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8826233148574829, + "num_tokens": 539094840.0, + "step": 14795 + }, + { + "epoch": 2.747632311977716, + "grad_norm": 1.5684044361114502, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8836112022399902, + "num_tokens": 539136981.0, + "step": 14796 + }, + { + "epoch": 2.7478180129990717, + "grad_norm": 1.563895583152771, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8894217014312744, + "num_tokens": 539172888.0, + "step": 14797 + }, + { + "epoch": 2.748003714020427, + "grad_norm": 1.6534600257873535, + "learning_rate": 1e-06, + "loss": 0.2563, + "mean_token_accuracy": 0.9065172672271729, + "num_tokens": 539202725.0, + "step": 14798 + }, + { + "epoch": 2.7481894150417827, + "grad_norm": 1.5839930772781372, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8872168064117432, + "num_tokens": 539241312.0, + "step": 14799 + }, + { + "epoch": 2.7483751160631384, + "grad_norm": 1.5207618474960327, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8942654132843018, + "num_tokens": 539278148.0, + "step": 14800 + }, + { + "epoch": 2.7485608170844937, + "grad_norm": 1.5865265130996704, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8716655969619751, + "num_tokens": 539317967.0, + "step": 14801 + }, + { + "epoch": 2.7487465181058495, + "grad_norm": 1.4348565340042114, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8950563669204712, + "num_tokens": 539359037.0, + "step": 14802 + }, + { + "epoch": 2.748932219127205, + "grad_norm": 1.5619127750396729, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8892104625701904, + "num_tokens": 539396089.0, + "step": 14803 + }, + { + "epoch": 2.749117920148561, + "grad_norm": 1.496312141418457, + "learning_rate": 1e-06, + "loss": 0.281, + "mean_token_accuracy": 0.9003396034240723, + "num_tokens": 539433831.0, + "step": 14804 + }, + { + "epoch": 2.7493036211699167, + "grad_norm": 1.6524068117141724, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8873019218444824, + "num_tokens": 539468528.0, + "step": 14805 + }, + { + "epoch": 2.749489322191272, + "grad_norm": 1.4538938999176025, + "learning_rate": 1e-06, + "loss": 0.285, + "mean_token_accuracy": 0.8975429534912109, + "num_tokens": 539507451.0, + "step": 14806 + }, + { + "epoch": 2.7496750232126277, + "grad_norm": 1.6730008125305176, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8813602924346924, + "num_tokens": 539540287.0, + "step": 14807 + }, + { + "epoch": 2.7498607242339834, + "grad_norm": 1.5479344129562378, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8913187384605408, + "num_tokens": 539577485.0, + "step": 14808 + }, + { + "epoch": 2.7500464252553387, + "grad_norm": 1.461402416229248, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8922511339187622, + "num_tokens": 539620874.0, + "step": 14809 + }, + { + "epoch": 2.7502321262766944, + "grad_norm": 1.6663748025894165, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.894554615020752, + "num_tokens": 539651965.0, + "step": 14810 + }, + { + "epoch": 2.75041782729805, + "grad_norm": 1.5528732538223267, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8788847923278809, + "num_tokens": 539690561.0, + "step": 14811 + }, + { + "epoch": 2.750603528319406, + "grad_norm": 1.6012603044509888, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8872069120407104, + "num_tokens": 539727153.0, + "step": 14812 + }, + { + "epoch": 2.7507892293407616, + "grad_norm": 1.554001808166504, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8845508098602295, + "num_tokens": 539765389.0, + "step": 14813 + }, + { + "epoch": 2.750974930362117, + "grad_norm": 1.5330594778060913, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8878057599067688, + "num_tokens": 539807691.0, + "step": 14814 + }, + { + "epoch": 2.7511606313834727, + "grad_norm": 1.7439870834350586, + "learning_rate": 1e-06, + "loss": 0.2795, + "mean_token_accuracy": 0.8989314436912537, + "num_tokens": 539835666.0, + "step": 14815 + }, + { + "epoch": 2.751346332404828, + "grad_norm": 1.6372981071472168, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8823415040969849, + "num_tokens": 539868891.0, + "step": 14816 + }, + { + "epoch": 2.7515320334261837, + "grad_norm": 1.5319143533706665, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8915970921516418, + "num_tokens": 539910288.0, + "step": 14817 + }, + { + "epoch": 2.7517177344475394, + "grad_norm": 1.4749881029129028, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8884292840957642, + "num_tokens": 539952294.0, + "step": 14818 + }, + { + "epoch": 2.751903435468895, + "grad_norm": 1.543043613433838, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8942765593528748, + "num_tokens": 539989742.0, + "step": 14819 + }, + { + "epoch": 2.752089136490251, + "grad_norm": 1.5172858238220215, + "learning_rate": 1e-06, + "loss": 0.28, + "mean_token_accuracy": 0.9007489681243896, + "num_tokens": 540027345.0, + "step": 14820 + }, + { + "epoch": 2.752274837511606, + "grad_norm": 1.7129954099655151, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8780419826507568, + "num_tokens": 540061740.0, + "step": 14821 + }, + { + "epoch": 2.752460538532962, + "grad_norm": 1.5649938583374023, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.888527512550354, + "num_tokens": 540099077.0, + "step": 14822 + }, + { + "epoch": 2.7526462395543176, + "grad_norm": 1.5818830728530884, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8883634805679321, + "num_tokens": 540136397.0, + "step": 14823 + }, + { + "epoch": 2.752831940575673, + "grad_norm": 1.6052500009536743, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.886682391166687, + "num_tokens": 540170260.0, + "step": 14824 + }, + { + "epoch": 2.7530176415970287, + "grad_norm": 1.499107837677002, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8938072323799133, + "num_tokens": 540206373.0, + "step": 14825 + }, + { + "epoch": 2.7532033426183844, + "grad_norm": 1.494682788848877, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8829975128173828, + "num_tokens": 540248386.0, + "step": 14826 + }, + { + "epoch": 2.75338904363974, + "grad_norm": 1.6566238403320312, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8808789849281311, + "num_tokens": 540282763.0, + "step": 14827 + }, + { + "epoch": 2.753574744661096, + "grad_norm": 1.8068643808364868, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8892143964767456, + "num_tokens": 540313292.0, + "step": 14828 + }, + { + "epoch": 2.753760445682451, + "grad_norm": 1.5021023750305176, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8801297545433044, + "num_tokens": 540357381.0, + "step": 14829 + }, + { + "epoch": 2.753946146703807, + "grad_norm": 1.7702782154083252, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8778702020645142, + "num_tokens": 540390682.0, + "step": 14830 + }, + { + "epoch": 2.7541318477251626, + "grad_norm": 1.4970844984054565, + "learning_rate": 1e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.9018952250480652, + "num_tokens": 540433004.0, + "step": 14831 + }, + { + "epoch": 2.754317548746518, + "grad_norm": 1.694951057434082, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8651896119117737, + "num_tokens": 540472803.0, + "step": 14832 + }, + { + "epoch": 2.7545032497678736, + "grad_norm": 1.5414258241653442, + "learning_rate": 1e-06, + "loss": 0.2742, + "mean_token_accuracy": 0.8984439373016357, + "num_tokens": 540510953.0, + "step": 14833 + }, + { + "epoch": 2.7546889507892294, + "grad_norm": 1.6577420234680176, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8970914483070374, + "num_tokens": 540549278.0, + "step": 14834 + }, + { + "epoch": 2.754874651810585, + "grad_norm": 1.5809803009033203, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8789076805114746, + "num_tokens": 540586232.0, + "step": 14835 + }, + { + "epoch": 2.755060352831941, + "grad_norm": 1.7094461917877197, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8883445262908936, + "num_tokens": 540617866.0, + "step": 14836 + }, + { + "epoch": 2.755246053853296, + "grad_norm": 1.6299164295196533, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8879134654998779, + "num_tokens": 540656278.0, + "step": 14837 + }, + { + "epoch": 2.755431754874652, + "grad_norm": 1.727174162864685, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.894390344619751, + "num_tokens": 540688129.0, + "step": 14838 + }, + { + "epoch": 2.755617455896007, + "grad_norm": 1.6160444021224976, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8892093896865845, + "num_tokens": 540726320.0, + "step": 14839 + }, + { + "epoch": 2.755803156917363, + "grad_norm": 1.6806652545928955, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8933970332145691, + "num_tokens": 540760177.0, + "step": 14840 + }, + { + "epoch": 2.7559888579387186, + "grad_norm": 1.6289660930633545, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8902552127838135, + "num_tokens": 540798244.0, + "step": 14841 + }, + { + "epoch": 2.7561745589600744, + "grad_norm": 1.6241730451583862, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8784095644950867, + "num_tokens": 540832133.0, + "step": 14842 + }, + { + "epoch": 2.75636025998143, + "grad_norm": 1.5534366369247437, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8824355602264404, + "num_tokens": 540869258.0, + "step": 14843 + }, + { + "epoch": 2.7565459610027854, + "grad_norm": 1.5406986474990845, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8845362067222595, + "num_tokens": 540906507.0, + "step": 14844 + }, + { + "epoch": 2.756731662024141, + "grad_norm": 1.596517562866211, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8838917016983032, + "num_tokens": 540945316.0, + "step": 14845 + }, + { + "epoch": 2.756917363045497, + "grad_norm": 1.5380958318710327, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8786910772323608, + "num_tokens": 540987681.0, + "step": 14846 + }, + { + "epoch": 2.757103064066852, + "grad_norm": 1.5164488554000854, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.8990541100502014, + "num_tokens": 541023620.0, + "step": 14847 + }, + { + "epoch": 2.757288765088208, + "grad_norm": 1.560984492301941, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8832628130912781, + "num_tokens": 541063420.0, + "step": 14848 + }, + { + "epoch": 2.7574744661095636, + "grad_norm": 1.6608268022537231, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8802515864372253, + "num_tokens": 541097326.0, + "step": 14849 + }, + { + "epoch": 2.7576601671309193, + "grad_norm": 1.628237009048462, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8813526630401611, + "num_tokens": 541135544.0, + "step": 14850 + }, + { + "epoch": 2.757845868152275, + "grad_norm": 1.6071667671203613, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8761559724807739, + "num_tokens": 541173858.0, + "step": 14851 + }, + { + "epoch": 2.7580315691736303, + "grad_norm": 1.6539617776870728, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8714842200279236, + "num_tokens": 541209944.0, + "step": 14852 + }, + { + "epoch": 2.758217270194986, + "grad_norm": 1.5139480829238892, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8827808499336243, + "num_tokens": 541253411.0, + "step": 14853 + }, + { + "epoch": 2.758402971216342, + "grad_norm": 1.6264218091964722, + "learning_rate": 1e-06, + "loss": 0.2769, + "mean_token_accuracy": 0.9012709259986877, + "num_tokens": 541285624.0, + "step": 14854 + }, + { + "epoch": 2.758588672237697, + "grad_norm": 1.519256591796875, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8897255063056946, + "num_tokens": 541326009.0, + "step": 14855 + }, + { + "epoch": 2.758774373259053, + "grad_norm": 1.5847198963165283, + "learning_rate": 1e-06, + "loss": 0.2742, + "mean_token_accuracy": 0.9021841883659363, + "num_tokens": 541361181.0, + "step": 14856 + }, + { + "epoch": 2.7589600742804086, + "grad_norm": 1.492064356803894, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8874003887176514, + "num_tokens": 541403372.0, + "step": 14857 + }, + { + "epoch": 2.7591457753017643, + "grad_norm": 1.6214525699615479, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.881924033164978, + "num_tokens": 541442437.0, + "step": 14858 + }, + { + "epoch": 2.75933147632312, + "grad_norm": 1.5357462167739868, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8920684456825256, + "num_tokens": 541477947.0, + "step": 14859 + }, + { + "epoch": 2.7595171773444753, + "grad_norm": 1.636888027191162, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8925929069519043, + "num_tokens": 541515078.0, + "step": 14860 + }, + { + "epoch": 2.759702878365831, + "grad_norm": 1.5787813663482666, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8863992691040039, + "num_tokens": 541553636.0, + "step": 14861 + }, + { + "epoch": 2.759888579387187, + "grad_norm": 1.8349149227142334, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8933191299438477, + "num_tokens": 541582700.0, + "step": 14862 + }, + { + "epoch": 2.760074280408542, + "grad_norm": 1.700059413909912, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8909562826156616, + "num_tokens": 541615882.0, + "step": 14863 + }, + { + "epoch": 2.760259981429898, + "grad_norm": 1.6275016069412231, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8847887516021729, + "num_tokens": 541654043.0, + "step": 14864 + }, + { + "epoch": 2.7604456824512535, + "grad_norm": 1.432823657989502, + "learning_rate": 1e-06, + "loss": 0.2786, + "mean_token_accuracy": 0.8984018564224243, + "num_tokens": 541693489.0, + "step": 14865 + }, + { + "epoch": 2.7606313834726093, + "grad_norm": 1.51398503780365, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.898289680480957, + "num_tokens": 541733761.0, + "step": 14866 + }, + { + "epoch": 2.7608170844939646, + "grad_norm": 1.6248725652694702, + "learning_rate": 1e-06, + "loss": 0.2793, + "mean_token_accuracy": 0.8998648524284363, + "num_tokens": 541765969.0, + "step": 14867 + }, + { + "epoch": 2.7610027855153203, + "grad_norm": 1.3444784879684448, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8906066417694092, + "num_tokens": 541811382.0, + "step": 14868 + }, + { + "epoch": 2.761188486536676, + "grad_norm": 1.5016485452651978, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8956204652786255, + "num_tokens": 541848013.0, + "step": 14869 + }, + { + "epoch": 2.7613741875580313, + "grad_norm": 1.5553908348083496, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8847854137420654, + "num_tokens": 541885976.0, + "step": 14870 + }, + { + "epoch": 2.761559888579387, + "grad_norm": 1.626570701599121, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8770708441734314, + "num_tokens": 541926633.0, + "step": 14871 + }, + { + "epoch": 2.761745589600743, + "grad_norm": 1.5322387218475342, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.878496527671814, + "num_tokens": 541967658.0, + "step": 14872 + }, + { + "epoch": 2.7619312906220985, + "grad_norm": 1.7303903102874756, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.883150577545166, + "num_tokens": 542000852.0, + "step": 14873 + }, + { + "epoch": 2.7621169916434543, + "grad_norm": 1.690915584564209, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8736271858215332, + "num_tokens": 542036541.0, + "step": 14874 + }, + { + "epoch": 2.7623026926648095, + "grad_norm": 1.6465773582458496, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8825663328170776, + "num_tokens": 542072552.0, + "step": 14875 + }, + { + "epoch": 2.7624883936861653, + "grad_norm": 1.643332839012146, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8879162669181824, + "num_tokens": 542108208.0, + "step": 14876 + }, + { + "epoch": 2.762674094707521, + "grad_norm": 1.593196988105774, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.896198570728302, + "num_tokens": 542144405.0, + "step": 14877 + }, + { + "epoch": 2.7628597957288763, + "grad_norm": 1.4859987497329712, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8863584995269775, + "num_tokens": 542188351.0, + "step": 14878 + }, + { + "epoch": 2.763045496750232, + "grad_norm": 1.5234180688858032, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8824732303619385, + "num_tokens": 542232011.0, + "step": 14879 + }, + { + "epoch": 2.7632311977715878, + "grad_norm": 1.6093518733978271, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8917203545570374, + "num_tokens": 542267467.0, + "step": 14880 + }, + { + "epoch": 2.7634168987929435, + "grad_norm": 1.6109119653701782, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8726577758789062, + "num_tokens": 542309159.0, + "step": 14881 + }, + { + "epoch": 2.7636025998142992, + "grad_norm": 1.6638305187225342, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8829561471939087, + "num_tokens": 542342414.0, + "step": 14882 + }, + { + "epoch": 2.7637883008356545, + "grad_norm": 1.668682336807251, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8771929144859314, + "num_tokens": 542379144.0, + "step": 14883 + }, + { + "epoch": 2.7639740018570103, + "grad_norm": 1.6003340482711792, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8863581418991089, + "num_tokens": 542415840.0, + "step": 14884 + }, + { + "epoch": 2.764159702878366, + "grad_norm": 1.512405514717102, + "learning_rate": 1e-06, + "loss": 0.2668, + "mean_token_accuracy": 0.9021260738372803, + "num_tokens": 542451086.0, + "step": 14885 + }, + { + "epoch": 2.7643454038997213, + "grad_norm": 1.5547912120819092, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8832255601882935, + "num_tokens": 542489067.0, + "step": 14886 + }, + { + "epoch": 2.764531104921077, + "grad_norm": 1.6349493265151978, + "learning_rate": 1e-06, + "loss": 0.2732, + "mean_token_accuracy": 0.9010686874389648, + "num_tokens": 542522364.0, + "step": 14887 + }, + { + "epoch": 2.7647168059424327, + "grad_norm": 1.6020488739013672, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8897322416305542, + "num_tokens": 542554637.0, + "step": 14888 + }, + { + "epoch": 2.7649025069637885, + "grad_norm": 1.594801902770996, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8848689198493958, + "num_tokens": 542590796.0, + "step": 14889 + }, + { + "epoch": 2.7650882079851438, + "grad_norm": 1.5526989698410034, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8971271514892578, + "num_tokens": 542628077.0, + "step": 14890 + }, + { + "epoch": 2.7652739090064995, + "grad_norm": 1.6913948059082031, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.8974906206130981, + "num_tokens": 542657999.0, + "step": 14891 + }, + { + "epoch": 2.7654596100278552, + "grad_norm": 1.6438720226287842, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.890330970287323, + "num_tokens": 542693994.0, + "step": 14892 + }, + { + "epoch": 2.7656453110492105, + "grad_norm": 1.7181349992752075, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8879989385604858, + "num_tokens": 542729033.0, + "step": 14893 + }, + { + "epoch": 2.7658310120705663, + "grad_norm": 1.8196815252304077, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8699213862419128, + "num_tokens": 542760619.0, + "step": 14894 + }, + { + "epoch": 2.766016713091922, + "grad_norm": 1.5713447332382202, + "learning_rate": 1e-06, + "loss": 0.281, + "mean_token_accuracy": 0.8998894691467285, + "num_tokens": 542798001.0, + "step": 14895 + }, + { + "epoch": 2.7662024141132777, + "grad_norm": 1.6728925704956055, + "learning_rate": 1e-06, + "loss": 0.2795, + "mean_token_accuracy": 0.9015021324157715, + "num_tokens": 542830726.0, + "step": 14896 + }, + { + "epoch": 2.7663881151346335, + "grad_norm": 1.7172220945358276, + "learning_rate": 1e-06, + "loss": 0.2793, + "mean_token_accuracy": 0.8989318013191223, + "num_tokens": 542862481.0, + "step": 14897 + }, + { + "epoch": 2.7665738161559887, + "grad_norm": 1.5676881074905396, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8963596820831299, + "num_tokens": 542897947.0, + "step": 14898 + }, + { + "epoch": 2.7667595171773445, + "grad_norm": 1.52566659450531, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8911347389221191, + "num_tokens": 542937253.0, + "step": 14899 + }, + { + "epoch": 2.7669452181987, + "grad_norm": 1.5401115417480469, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8715551495552063, + "num_tokens": 542977430.0, + "step": 14900 + }, + { + "epoch": 2.7671309192200555, + "grad_norm": 1.4965054988861084, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8946224451065063, + "num_tokens": 543016185.0, + "step": 14901 + }, + { + "epoch": 2.7673166202414112, + "grad_norm": 1.5958682298660278, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8826149702072144, + "num_tokens": 543053496.0, + "step": 14902 + }, + { + "epoch": 2.767502321262767, + "grad_norm": 1.500816822052002, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8834551572799683, + "num_tokens": 543093228.0, + "step": 14903 + }, + { + "epoch": 2.7676880222841227, + "grad_norm": 1.5914161205291748, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8874441385269165, + "num_tokens": 543132759.0, + "step": 14904 + }, + { + "epoch": 2.7678737233054784, + "grad_norm": 1.3953912258148193, + "learning_rate": 1e-06, + "loss": 0.28, + "mean_token_accuracy": 0.899864673614502, + "num_tokens": 543176320.0, + "step": 14905 + }, + { + "epoch": 2.7680594243268337, + "grad_norm": 1.6738165616989136, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8909733295440674, + "num_tokens": 543209417.0, + "step": 14906 + }, + { + "epoch": 2.7682451253481895, + "grad_norm": 1.642905354499817, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8794569969177246, + "num_tokens": 543246575.0, + "step": 14907 + }, + { + "epoch": 2.768430826369545, + "grad_norm": 1.4638540744781494, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.890178382396698, + "num_tokens": 543289365.0, + "step": 14908 + }, + { + "epoch": 2.7686165273909005, + "grad_norm": 1.5793033838272095, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8972121477127075, + "num_tokens": 543324489.0, + "step": 14909 + }, + { + "epoch": 2.768802228412256, + "grad_norm": 1.4605839252471924, + "learning_rate": 1e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.8987882137298584, + "num_tokens": 543366299.0, + "step": 14910 + }, + { + "epoch": 2.768987929433612, + "grad_norm": 1.8133361339569092, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.861880362033844, + "num_tokens": 543403128.0, + "step": 14911 + }, + { + "epoch": 2.7691736304549677, + "grad_norm": 1.6399517059326172, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8913936614990234, + "num_tokens": 543439001.0, + "step": 14912 + }, + { + "epoch": 2.769359331476323, + "grad_norm": 1.6115139722824097, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8821253776550293, + "num_tokens": 543479911.0, + "step": 14913 + }, + { + "epoch": 2.7695450324976787, + "grad_norm": 1.568477749824524, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.8977314829826355, + "num_tokens": 543514594.0, + "step": 14914 + }, + { + "epoch": 2.7697307335190344, + "grad_norm": 1.8558138608932495, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8633043766021729, + "num_tokens": 543550557.0, + "step": 14915 + }, + { + "epoch": 2.7699164345403897, + "grad_norm": 1.5482497215270996, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.885245680809021, + "num_tokens": 543588747.0, + "step": 14916 + }, + { + "epoch": 2.7701021355617454, + "grad_norm": 1.441056728363037, + "learning_rate": 1e-06, + "loss": 0.2679, + "mean_token_accuracy": 0.9023956060409546, + "num_tokens": 543628046.0, + "step": 14917 + }, + { + "epoch": 2.770287836583101, + "grad_norm": 1.544142723083496, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8848434090614319, + "num_tokens": 543665909.0, + "step": 14918 + }, + { + "epoch": 2.770473537604457, + "grad_norm": 1.7088115215301514, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.880754828453064, + "num_tokens": 543703026.0, + "step": 14919 + }, + { + "epoch": 2.7706592386258126, + "grad_norm": 1.6742088794708252, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8690601587295532, + "num_tokens": 543739511.0, + "step": 14920 + }, + { + "epoch": 2.770844939647168, + "grad_norm": 1.6949729919433594, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8910282850265503, + "num_tokens": 543769782.0, + "step": 14921 + }, + { + "epoch": 2.7710306406685237, + "grad_norm": 1.6919152736663818, + "learning_rate": 1e-06, + "loss": 0.2793, + "mean_token_accuracy": 0.9007363319396973, + "num_tokens": 543800299.0, + "step": 14922 + }, + { + "epoch": 2.7712163416898794, + "grad_norm": 1.6368823051452637, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8942105770111084, + "num_tokens": 543836000.0, + "step": 14923 + }, + { + "epoch": 2.7714020427112347, + "grad_norm": 1.7847976684570312, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8731510043144226, + "num_tokens": 543867421.0, + "step": 14924 + }, + { + "epoch": 2.7715877437325904, + "grad_norm": 1.692577838897705, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8863636255264282, + "num_tokens": 543901346.0, + "step": 14925 + }, + { + "epoch": 2.771773444753946, + "grad_norm": 1.6932919025421143, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.873795747756958, + "num_tokens": 543935562.0, + "step": 14926 + }, + { + "epoch": 2.771959145775302, + "grad_norm": 1.7449190616607666, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8869345188140869, + "num_tokens": 543966792.0, + "step": 14927 + }, + { + "epoch": 2.7721448467966576, + "grad_norm": 1.7895104885101318, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8684481382369995, + "num_tokens": 544001897.0, + "step": 14928 + }, + { + "epoch": 2.772330547818013, + "grad_norm": 1.643456220626831, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8874993324279785, + "num_tokens": 544037355.0, + "step": 14929 + }, + { + "epoch": 2.7725162488393686, + "grad_norm": 1.4620444774627686, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8905292749404907, + "num_tokens": 544079156.0, + "step": 14930 + }, + { + "epoch": 2.7727019498607244, + "grad_norm": 1.5020095109939575, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.8968180418014526, + "num_tokens": 544120526.0, + "step": 14931 + }, + { + "epoch": 2.7728876508820797, + "grad_norm": 1.4410275220870972, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8977694511413574, + "num_tokens": 544160038.0, + "step": 14932 + }, + { + "epoch": 2.7730733519034354, + "grad_norm": 1.5847917795181274, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8887909650802612, + "num_tokens": 544195865.0, + "step": 14933 + }, + { + "epoch": 2.773259052924791, + "grad_norm": 1.6416850090026855, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8766172528266907, + "num_tokens": 544233049.0, + "step": 14934 + }, + { + "epoch": 2.773444753946147, + "grad_norm": 1.6741418838500977, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8765561580657959, + "num_tokens": 544269615.0, + "step": 14935 + }, + { + "epoch": 2.773630454967502, + "grad_norm": 1.4850026369094849, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8947746753692627, + "num_tokens": 544306263.0, + "step": 14936 + }, + { + "epoch": 2.773816155988858, + "grad_norm": 1.910560965538025, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8868429064750671, + "num_tokens": 544335524.0, + "step": 14937 + }, + { + "epoch": 2.7740018570102136, + "grad_norm": 2.000364303588867, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8895519375801086, + "num_tokens": 544362381.0, + "step": 14938 + }, + { + "epoch": 2.774187558031569, + "grad_norm": 1.7200933694839478, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8533698320388794, + "num_tokens": 544400068.0, + "step": 14939 + }, + { + "epoch": 2.7743732590529246, + "grad_norm": 1.7096184492111206, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.886701762676239, + "num_tokens": 544433972.0, + "step": 14940 + }, + { + "epoch": 2.7745589600742804, + "grad_norm": 1.5486217737197876, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8812509775161743, + "num_tokens": 544473564.0, + "step": 14941 + }, + { + "epoch": 2.774744661095636, + "grad_norm": 1.730050802230835, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8717137575149536, + "num_tokens": 544508080.0, + "step": 14942 + }, + { + "epoch": 2.774930362116992, + "grad_norm": 1.6806328296661377, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8802730441093445, + "num_tokens": 544544670.0, + "step": 14943 + }, + { + "epoch": 2.775116063138347, + "grad_norm": 1.6689437627792358, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8935102820396423, + "num_tokens": 544583627.0, + "step": 14944 + }, + { + "epoch": 2.775301764159703, + "grad_norm": 1.4824309349060059, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8862260580062866, + "num_tokens": 544626690.0, + "step": 14945 + }, + { + "epoch": 2.7754874651810586, + "grad_norm": 1.7781447172164917, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8898142576217651, + "num_tokens": 544662462.0, + "step": 14946 + }, + { + "epoch": 2.775673166202414, + "grad_norm": 1.629055380821228, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8933013677597046, + "num_tokens": 544699319.0, + "step": 14947 + }, + { + "epoch": 2.7758588672237696, + "grad_norm": 1.7401776313781738, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8875254988670349, + "num_tokens": 544730497.0, + "step": 14948 + }, + { + "epoch": 2.7760445682451254, + "grad_norm": 1.4910123348236084, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8938444256782532, + "num_tokens": 544773576.0, + "step": 14949 + }, + { + "epoch": 2.776230269266481, + "grad_norm": 1.6109352111816406, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8871859908103943, + "num_tokens": 544812497.0, + "step": 14950 + }, + { + "epoch": 2.776415970287837, + "grad_norm": 1.5635905265808105, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8839523792266846, + "num_tokens": 544852358.0, + "step": 14951 + }, + { + "epoch": 2.776601671309192, + "grad_norm": 1.6511595249176025, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8800414204597473, + "num_tokens": 544888560.0, + "step": 14952 + }, + { + "epoch": 2.776787372330548, + "grad_norm": 1.6621321439743042, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8781936168670654, + "num_tokens": 544925697.0, + "step": 14953 + }, + { + "epoch": 2.7769730733519036, + "grad_norm": 1.6386096477508545, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8883107900619507, + "num_tokens": 544962205.0, + "step": 14954 + }, + { + "epoch": 2.777158774373259, + "grad_norm": 1.6403926610946655, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8907971382141113, + "num_tokens": 544997725.0, + "step": 14955 + }, + { + "epoch": 2.7773444753946146, + "grad_norm": 1.5562752485275269, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8905632495880127, + "num_tokens": 545035373.0, + "step": 14956 + }, + { + "epoch": 2.7775301764159703, + "grad_norm": 1.5356580018997192, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8920875787734985, + "num_tokens": 545072701.0, + "step": 14957 + }, + { + "epoch": 2.777715877437326, + "grad_norm": 1.4198230504989624, + "learning_rate": 1e-06, + "loss": 0.2649, + "mean_token_accuracy": 0.9024499654769897, + "num_tokens": 545109384.0, + "step": 14958 + }, + { + "epoch": 2.777901578458682, + "grad_norm": 1.6394680738449097, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8829913139343262, + "num_tokens": 545146973.0, + "step": 14959 + }, + { + "epoch": 2.778087279480037, + "grad_norm": 1.584561824798584, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8912115097045898, + "num_tokens": 545184018.0, + "step": 14960 + }, + { + "epoch": 2.778272980501393, + "grad_norm": 1.6390509605407715, + "learning_rate": 1e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.8949460387229919, + "num_tokens": 545218116.0, + "step": 14961 + }, + { + "epoch": 2.778458681522748, + "grad_norm": 1.7367640733718872, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8776427507400513, + "num_tokens": 545252562.0, + "step": 14962 + }, + { + "epoch": 2.778644382544104, + "grad_norm": 1.4563543796539307, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8817604780197144, + "num_tokens": 545296170.0, + "step": 14963 + }, + { + "epoch": 2.7788300835654596, + "grad_norm": 1.5463732481002808, + "learning_rate": 1e-06, + "loss": 0.2733, + "mean_token_accuracy": 0.9012367129325867, + "num_tokens": 545330858.0, + "step": 14964 + }, + { + "epoch": 2.7790157845868153, + "grad_norm": 1.8479642868041992, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8825963735580444, + "num_tokens": 545364066.0, + "step": 14965 + }, + { + "epoch": 2.779201485608171, + "grad_norm": 1.655064582824707, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8909961581230164, + "num_tokens": 545396799.0, + "step": 14966 + }, + { + "epoch": 2.7793871866295263, + "grad_norm": 1.4766050577163696, + "learning_rate": 1e-06, + "loss": 0.257, + "mean_token_accuracy": 0.9044853448867798, + "num_tokens": 545436286.0, + "step": 14967 + }, + { + "epoch": 2.779572887650882, + "grad_norm": 1.5488665103912354, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8932741284370422, + "num_tokens": 545473891.0, + "step": 14968 + }, + { + "epoch": 2.779758588672238, + "grad_norm": 1.580326795578003, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8886911869049072, + "num_tokens": 545511161.0, + "step": 14969 + }, + { + "epoch": 2.779944289693593, + "grad_norm": 1.617878794670105, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8893272876739502, + "num_tokens": 545545483.0, + "step": 14970 + }, + { + "epoch": 2.780129990714949, + "grad_norm": 1.6926096677780151, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8671238422393799, + "num_tokens": 545581837.0, + "step": 14971 + }, + { + "epoch": 2.7803156917363046, + "grad_norm": 1.6854465007781982, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8735520839691162, + "num_tokens": 545618817.0, + "step": 14972 + }, + { + "epoch": 2.7805013927576603, + "grad_norm": 1.6091747283935547, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8903717994689941, + "num_tokens": 545653004.0, + "step": 14973 + }, + { + "epoch": 2.780687093779016, + "grad_norm": 1.6819409132003784, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8867660760879517, + "num_tokens": 545686186.0, + "step": 14974 + }, + { + "epoch": 2.7808727948003713, + "grad_norm": 1.6393191814422607, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8892888426780701, + "num_tokens": 545724339.0, + "step": 14975 + }, + { + "epoch": 2.781058495821727, + "grad_norm": 1.7448757886886597, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8778318762779236, + "num_tokens": 545762983.0, + "step": 14976 + }, + { + "epoch": 2.7812441968430828, + "grad_norm": 1.5503747463226318, + "learning_rate": 1e-06, + "loss": 0.27, + "mean_token_accuracy": 0.900120735168457, + "num_tokens": 545801669.0, + "step": 14977 + }, + { + "epoch": 2.781429897864438, + "grad_norm": 1.511540412902832, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8819901347160339, + "num_tokens": 545840314.0, + "step": 14978 + }, + { + "epoch": 2.781615598885794, + "grad_norm": 1.6551964282989502, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8898022174835205, + "num_tokens": 545874999.0, + "step": 14979 + }, + { + "epoch": 2.7818012999071495, + "grad_norm": 1.6203399896621704, + "learning_rate": 1e-06, + "loss": 0.2717, + "mean_token_accuracy": 0.9012019038200378, + "num_tokens": 545906699.0, + "step": 14980 + }, + { + "epoch": 2.7819870009285053, + "grad_norm": 1.5160330533981323, + "learning_rate": 1e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.8988288640975952, + "num_tokens": 545943372.0, + "step": 14981 + }, + { + "epoch": 2.782172701949861, + "grad_norm": 1.675576090812683, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8824924230575562, + "num_tokens": 545977149.0, + "step": 14982 + }, + { + "epoch": 2.7823584029712163, + "grad_norm": 1.7147433757781982, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8847578763961792, + "num_tokens": 546007523.0, + "step": 14983 + }, + { + "epoch": 2.782544103992572, + "grad_norm": 1.5917541980743408, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.888596773147583, + "num_tokens": 546047555.0, + "step": 14984 + }, + { + "epoch": 2.7827298050139273, + "grad_norm": 1.5270748138427734, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8898824453353882, + "num_tokens": 546089479.0, + "step": 14985 + }, + { + "epoch": 2.782915506035283, + "grad_norm": 1.749174952507019, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8737530708312988, + "num_tokens": 546121587.0, + "step": 14986 + }, + { + "epoch": 2.7831012070566388, + "grad_norm": 1.5327016115188599, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8815575838088989, + "num_tokens": 546163198.0, + "step": 14987 + }, + { + "epoch": 2.7832869080779945, + "grad_norm": 1.8471696376800537, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8822928071022034, + "num_tokens": 546196768.0, + "step": 14988 + }, + { + "epoch": 2.7834726090993502, + "grad_norm": 1.5583126544952393, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8847643733024597, + "num_tokens": 546235830.0, + "step": 14989 + }, + { + "epoch": 2.7836583101207055, + "grad_norm": 1.6591770648956299, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8768967390060425, + "num_tokens": 546273297.0, + "step": 14990 + }, + { + "epoch": 2.7838440111420613, + "grad_norm": 1.6074705123901367, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8717159032821655, + "num_tokens": 546312352.0, + "step": 14991 + }, + { + "epoch": 2.784029712163417, + "grad_norm": 1.604494571685791, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.870715856552124, + "num_tokens": 546354569.0, + "step": 14992 + }, + { + "epoch": 2.7842154131847723, + "grad_norm": 1.6632803678512573, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8859971165657043, + "num_tokens": 546389955.0, + "step": 14993 + }, + { + "epoch": 2.784401114206128, + "grad_norm": 1.6868504285812378, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8882877826690674, + "num_tokens": 546425936.0, + "step": 14994 + }, + { + "epoch": 2.7845868152274837, + "grad_norm": 1.5880180597305298, + "learning_rate": 1e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.8975623250007629, + "num_tokens": 546460525.0, + "step": 14995 + }, + { + "epoch": 2.7847725162488395, + "grad_norm": 1.4374306201934814, + "learning_rate": 1e-06, + "loss": 0.2832, + "mean_token_accuracy": 0.8959516286849976, + "num_tokens": 546504497.0, + "step": 14996 + }, + { + "epoch": 2.784958217270195, + "grad_norm": 1.5908300876617432, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8830558061599731, + "num_tokens": 546540832.0, + "step": 14997 + }, + { + "epoch": 2.7851439182915505, + "grad_norm": 1.6168112754821777, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8959933519363403, + "num_tokens": 546575733.0, + "step": 14998 + }, + { + "epoch": 2.7853296193129062, + "grad_norm": 1.374244213104248, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8897686004638672, + "num_tokens": 546623968.0, + "step": 14999 + }, + { + "epoch": 2.785515320334262, + "grad_norm": 1.4957685470581055, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8906561732292175, + "num_tokens": 546663526.0, + "step": 15000 + }, + { + "epoch": 2.7857010213556173, + "grad_norm": 1.8201206922531128, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8869335651397705, + "num_tokens": 546697252.0, + "step": 15001 + }, + { + "epoch": 2.785886722376973, + "grad_norm": 1.4609148502349854, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8908196091651917, + "num_tokens": 546736544.0, + "step": 15002 + }, + { + "epoch": 2.7860724233983287, + "grad_norm": 1.5786678791046143, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8795601725578308, + "num_tokens": 546776100.0, + "step": 15003 + }, + { + "epoch": 2.7862581244196845, + "grad_norm": 1.5637643337249756, + "learning_rate": 1e-06, + "loss": 0.2631, + "mean_token_accuracy": 0.9035662412643433, + "num_tokens": 546811927.0, + "step": 15004 + }, + { + "epoch": 2.78644382544104, + "grad_norm": 1.4387849569320679, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.882129430770874, + "num_tokens": 546854240.0, + "step": 15005 + }, + { + "epoch": 2.7866295264623955, + "grad_norm": 1.569067120552063, + "learning_rate": 1e-06, + "loss": 0.2755, + "mean_token_accuracy": 0.9006814360618591, + "num_tokens": 546885893.0, + "step": 15006 + }, + { + "epoch": 2.786815227483751, + "grad_norm": 1.6544429063796997, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8849285244941711, + "num_tokens": 546920782.0, + "step": 15007 + }, + { + "epoch": 2.7870009285051065, + "grad_norm": 1.65814208984375, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8943865299224854, + "num_tokens": 546954528.0, + "step": 15008 + }, + { + "epoch": 2.7871866295264622, + "grad_norm": 1.6627388000488281, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8898698091506958, + "num_tokens": 546992513.0, + "step": 15009 + }, + { + "epoch": 2.787372330547818, + "grad_norm": 1.6699292659759521, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8889833092689514, + "num_tokens": 547027708.0, + "step": 15010 + }, + { + "epoch": 2.7875580315691737, + "grad_norm": 1.5130720138549805, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8849190473556519, + "num_tokens": 547067094.0, + "step": 15011 + }, + { + "epoch": 2.7877437325905294, + "grad_norm": 1.5721229314804077, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.8920317888259888, + "num_tokens": 547106758.0, + "step": 15012 + }, + { + "epoch": 2.7879294336118847, + "grad_norm": 1.5326213836669922, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8780124187469482, + "num_tokens": 547149896.0, + "step": 15013 + }, + { + "epoch": 2.7881151346332405, + "grad_norm": 1.5756255388259888, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8881217241287231, + "num_tokens": 547190419.0, + "step": 15014 + }, + { + "epoch": 2.788300835654596, + "grad_norm": 1.6467217206954956, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.870536208152771, + "num_tokens": 547229109.0, + "step": 15015 + }, + { + "epoch": 2.7884865366759515, + "grad_norm": 1.6369032859802246, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8786838054656982, + "num_tokens": 547266252.0, + "step": 15016 + }, + { + "epoch": 2.788672237697307, + "grad_norm": 1.8111385107040405, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8813205361366272, + "num_tokens": 547299611.0, + "step": 15017 + }, + { + "epoch": 2.788857938718663, + "grad_norm": 1.5222442150115967, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8880942463874817, + "num_tokens": 547340257.0, + "step": 15018 + }, + { + "epoch": 2.7890436397400187, + "grad_norm": 1.5976266860961914, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8884098529815674, + "num_tokens": 547380575.0, + "step": 15019 + }, + { + "epoch": 2.7892293407613744, + "grad_norm": 1.6155050992965698, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8890881538391113, + "num_tokens": 547422561.0, + "step": 15020 + }, + { + "epoch": 2.7894150417827297, + "grad_norm": 1.673187494277954, + "learning_rate": 1e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.8969171047210693, + "num_tokens": 547456627.0, + "step": 15021 + }, + { + "epoch": 2.7896007428040854, + "grad_norm": 1.6098344326019287, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8794522285461426, + "num_tokens": 547492681.0, + "step": 15022 + }, + { + "epoch": 2.789786443825441, + "grad_norm": 1.561400055885315, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8937808275222778, + "num_tokens": 547529522.0, + "step": 15023 + }, + { + "epoch": 2.7899721448467965, + "grad_norm": 1.5830377340316772, + "learning_rate": 1e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.9000880718231201, + "num_tokens": 547566749.0, + "step": 15024 + }, + { + "epoch": 2.790157845868152, + "grad_norm": 1.5365794897079468, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8789560794830322, + "num_tokens": 547605801.0, + "step": 15025 + }, + { + "epoch": 2.790343546889508, + "grad_norm": 1.733420491218567, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8848663568496704, + "num_tokens": 547642116.0, + "step": 15026 + }, + { + "epoch": 2.7905292479108637, + "grad_norm": 1.571389079093933, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.890436053276062, + "num_tokens": 547679970.0, + "step": 15027 + }, + { + "epoch": 2.7907149489322194, + "grad_norm": 1.6151511669158936, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8878871202468872, + "num_tokens": 547716673.0, + "step": 15028 + }, + { + "epoch": 2.7909006499535747, + "grad_norm": 1.5800001621246338, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8842480182647705, + "num_tokens": 547756196.0, + "step": 15029 + }, + { + "epoch": 2.7910863509749304, + "grad_norm": 1.7461868524551392, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8766523003578186, + "num_tokens": 547787902.0, + "step": 15030 + }, + { + "epoch": 2.791272051996286, + "grad_norm": 1.7255195379257202, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.896093487739563, + "num_tokens": 547820734.0, + "step": 15031 + }, + { + "epoch": 2.7914577530176414, + "grad_norm": 1.725666880607605, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.886520504951477, + "num_tokens": 547857530.0, + "step": 15032 + }, + { + "epoch": 2.791643454038997, + "grad_norm": 1.580536127090454, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8822565078735352, + "num_tokens": 547897713.0, + "step": 15033 + }, + { + "epoch": 2.791829155060353, + "grad_norm": 1.5387099981307983, + "learning_rate": 1e-06, + "loss": 0.269, + "mean_token_accuracy": 0.9062169194221497, + "num_tokens": 547932471.0, + "step": 15034 + }, + { + "epoch": 2.7920148560817086, + "grad_norm": 1.6104562282562256, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8790806531906128, + "num_tokens": 547971129.0, + "step": 15035 + }, + { + "epoch": 2.792200557103064, + "grad_norm": 1.4939186573028564, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8941981792449951, + "num_tokens": 548009780.0, + "step": 15036 + }, + { + "epoch": 2.7923862581244197, + "grad_norm": 1.6161737442016602, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8909504413604736, + "num_tokens": 548044794.0, + "step": 15037 + }, + { + "epoch": 2.7925719591457754, + "grad_norm": 1.6485464572906494, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8844380378723145, + "num_tokens": 548083500.0, + "step": 15038 + }, + { + "epoch": 2.7927576601671307, + "grad_norm": 1.5403990745544434, + "learning_rate": 1e-06, + "loss": 0.2522, + "mean_token_accuracy": 0.9082186222076416, + "num_tokens": 548116652.0, + "step": 15039 + }, + { + "epoch": 2.7929433611884864, + "grad_norm": 1.6026500463485718, + "learning_rate": 1e-06, + "loss": 0.2685, + "mean_token_accuracy": 0.9022200703620911, + "num_tokens": 548151056.0, + "step": 15040 + }, + { + "epoch": 2.793129062209842, + "grad_norm": 1.5449713468551636, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8787577152252197, + "num_tokens": 548188664.0, + "step": 15041 + }, + { + "epoch": 2.793314763231198, + "grad_norm": 1.6110308170318604, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8781912326812744, + "num_tokens": 548227510.0, + "step": 15042 + }, + { + "epoch": 2.7935004642525536, + "grad_norm": 1.6055103540420532, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.8931430578231812, + "num_tokens": 548265854.0, + "step": 15043 + }, + { + "epoch": 2.793686165273909, + "grad_norm": 1.6500632762908936, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8818132281303406, + "num_tokens": 548300009.0, + "step": 15044 + }, + { + "epoch": 2.7938718662952646, + "grad_norm": 1.6609723567962646, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8798197507858276, + "num_tokens": 548339139.0, + "step": 15045 + }, + { + "epoch": 2.7940575673166204, + "grad_norm": 1.5689153671264648, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8956992626190186, + "num_tokens": 548372784.0, + "step": 15046 + }, + { + "epoch": 2.7942432683379756, + "grad_norm": 1.6100521087646484, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8917520046234131, + "num_tokens": 548406487.0, + "step": 15047 + }, + { + "epoch": 2.7944289693593314, + "grad_norm": 1.7156982421875, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8790102601051331, + "num_tokens": 548441791.0, + "step": 15048 + }, + { + "epoch": 2.794614670380687, + "grad_norm": 1.8068689107894897, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8951615691184998, + "num_tokens": 548469297.0, + "step": 15049 + }, + { + "epoch": 2.794800371402043, + "grad_norm": 1.6628849506378174, + "learning_rate": 1e-06, + "loss": 0.2849, + "mean_token_accuracy": 0.89678955078125, + "num_tokens": 548498479.0, + "step": 15050 + }, + { + "epoch": 2.7949860724233986, + "grad_norm": 1.654798984527588, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8943594098091125, + "num_tokens": 548530839.0, + "step": 15051 + }, + { + "epoch": 2.795171773444754, + "grad_norm": 1.6248040199279785, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8905948400497437, + "num_tokens": 548566169.0, + "step": 15052 + }, + { + "epoch": 2.7953574744661096, + "grad_norm": 1.5117610692977905, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8943982124328613, + "num_tokens": 548603796.0, + "step": 15053 + }, + { + "epoch": 2.7955431754874653, + "grad_norm": 1.6108849048614502, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.8918488621711731, + "num_tokens": 548636395.0, + "step": 15054 + }, + { + "epoch": 2.7957288765088206, + "grad_norm": 1.5399000644683838, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8876031637191772, + "num_tokens": 548676683.0, + "step": 15055 + }, + { + "epoch": 2.7959145775301764, + "grad_norm": 1.5763449668884277, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8849704265594482, + "num_tokens": 548712591.0, + "step": 15056 + }, + { + "epoch": 2.796100278551532, + "grad_norm": 1.6324325799942017, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8864588141441345, + "num_tokens": 548749023.0, + "step": 15057 + }, + { + "epoch": 2.796285979572888, + "grad_norm": 1.507856011390686, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8815164566040039, + "num_tokens": 548790942.0, + "step": 15058 + }, + { + "epoch": 2.796471680594243, + "grad_norm": 1.5095481872558594, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8913280963897705, + "num_tokens": 548830460.0, + "step": 15059 + }, + { + "epoch": 2.796657381615599, + "grad_norm": 1.6669648885726929, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8872517347335815, + "num_tokens": 548862694.0, + "step": 15060 + }, + { + "epoch": 2.7968430826369546, + "grad_norm": 1.6013107299804688, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8975974321365356, + "num_tokens": 548897570.0, + "step": 15061 + }, + { + "epoch": 2.79702878365831, + "grad_norm": 1.7024849653244019, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8896012902259827, + "num_tokens": 548928942.0, + "step": 15062 + }, + { + "epoch": 2.7972144846796656, + "grad_norm": 1.6431533098220825, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8806155920028687, + "num_tokens": 548965196.0, + "step": 15063 + }, + { + "epoch": 2.7974001857010213, + "grad_norm": 1.603119134902954, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8906992673873901, + "num_tokens": 549000397.0, + "step": 15064 + }, + { + "epoch": 2.797585886722377, + "grad_norm": 1.6835254430770874, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.867947518825531, + "num_tokens": 549037885.0, + "step": 15065 + }, + { + "epoch": 2.797771587743733, + "grad_norm": 1.6467223167419434, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8938220739364624, + "num_tokens": 549072509.0, + "step": 15066 + }, + { + "epoch": 2.797957288765088, + "grad_norm": 1.6447288990020752, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8879174590110779, + "num_tokens": 549110156.0, + "step": 15067 + }, + { + "epoch": 2.798142989786444, + "grad_norm": 1.7705711126327515, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8738760948181152, + "num_tokens": 549144739.0, + "step": 15068 + }, + { + "epoch": 2.7983286908077996, + "grad_norm": 1.6823358535766602, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8722803592681885, + "num_tokens": 549180696.0, + "step": 15069 + }, + { + "epoch": 2.798514391829155, + "grad_norm": 1.5766687393188477, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8844789862632751, + "num_tokens": 549221024.0, + "step": 15070 + }, + { + "epoch": 2.7987000928505106, + "grad_norm": 1.6515296697616577, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8871736526489258, + "num_tokens": 549256407.0, + "step": 15071 + }, + { + "epoch": 2.7988857938718663, + "grad_norm": 1.561651587486267, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8886998891830444, + "num_tokens": 549296475.0, + "step": 15072 + }, + { + "epoch": 2.799071494893222, + "grad_norm": 1.5791908502578735, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8793416619300842, + "num_tokens": 549334170.0, + "step": 15073 + }, + { + "epoch": 2.799257195914578, + "grad_norm": 1.433828592300415, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8926597237586975, + "num_tokens": 549378399.0, + "step": 15074 + }, + { + "epoch": 2.799442896935933, + "grad_norm": 1.5951273441314697, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8860096335411072, + "num_tokens": 549419499.0, + "step": 15075 + }, + { + "epoch": 2.799628597957289, + "grad_norm": 1.7332932949066162, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.880537748336792, + "num_tokens": 549450337.0, + "step": 15076 + }, + { + "epoch": 2.7998142989786445, + "grad_norm": 1.6740378141403198, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8833404779434204, + "num_tokens": 549483776.0, + "step": 15077 + }, + { + "epoch": 2.8, + "grad_norm": 1.605978012084961, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8793889284133911, + "num_tokens": 549519920.0, + "step": 15078 + }, + { + "epoch": 2.8001857010213556, + "grad_norm": 1.6707009077072144, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8806787729263306, + "num_tokens": 549552409.0, + "step": 15079 + }, + { + "epoch": 2.8003714020427113, + "grad_norm": 1.5468984842300415, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8959164619445801, + "num_tokens": 549588831.0, + "step": 15080 + }, + { + "epoch": 2.800557103064067, + "grad_norm": 1.5738639831542969, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8914043307304382, + "num_tokens": 549626411.0, + "step": 15081 + }, + { + "epoch": 2.8007428040854223, + "grad_norm": 1.5078259706497192, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8923631906509399, + "num_tokens": 549664036.0, + "step": 15082 + }, + { + "epoch": 2.800928505106778, + "grad_norm": 1.538018822669983, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8859195709228516, + "num_tokens": 549702551.0, + "step": 15083 + }, + { + "epoch": 2.8011142061281338, + "grad_norm": 1.4977898597717285, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8975735902786255, + "num_tokens": 549743299.0, + "step": 15084 + }, + { + "epoch": 2.801299907149489, + "grad_norm": 1.7420393228530884, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8762830495834351, + "num_tokens": 549776674.0, + "step": 15085 + }, + { + "epoch": 2.801485608170845, + "grad_norm": 1.4867300987243652, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.895857572555542, + "num_tokens": 549815365.0, + "step": 15086 + }, + { + "epoch": 2.8016713091922005, + "grad_norm": 1.613520622253418, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.882118821144104, + "num_tokens": 549854919.0, + "step": 15087 + }, + { + "epoch": 2.8018570102135563, + "grad_norm": 1.404565453529358, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.896604597568512, + "num_tokens": 549896104.0, + "step": 15088 + }, + { + "epoch": 2.802042711234912, + "grad_norm": 1.6016135215759277, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.892192006111145, + "num_tokens": 549932826.0, + "step": 15089 + }, + { + "epoch": 2.8022284122562673, + "grad_norm": 1.563040018081665, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8739385604858398, + "num_tokens": 549972005.0, + "step": 15090 + }, + { + "epoch": 2.802414113277623, + "grad_norm": 1.697300910949707, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8936976194381714, + "num_tokens": 550006410.0, + "step": 15091 + }, + { + "epoch": 2.8025998142989788, + "grad_norm": 1.4598067998886108, + "learning_rate": 1e-06, + "loss": 0.2755, + "mean_token_accuracy": 0.8992782831192017, + "num_tokens": 550045841.0, + "step": 15092 + }, + { + "epoch": 2.802785515320334, + "grad_norm": 1.6128425598144531, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8876904249191284, + "num_tokens": 550081904.0, + "step": 15093 + }, + { + "epoch": 2.8029712163416898, + "grad_norm": 1.6658494472503662, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8862839937210083, + "num_tokens": 550115697.0, + "step": 15094 + }, + { + "epoch": 2.8031569173630455, + "grad_norm": 1.5514240264892578, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8903244137763977, + "num_tokens": 550150269.0, + "step": 15095 + }, + { + "epoch": 2.8033426183844012, + "grad_norm": 1.6852002143859863, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8913512229919434, + "num_tokens": 550183453.0, + "step": 15096 + }, + { + "epoch": 2.803528319405757, + "grad_norm": 1.493486762046814, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8889610767364502, + "num_tokens": 550225615.0, + "step": 15097 + }, + { + "epoch": 2.8037140204271123, + "grad_norm": 1.734330654144287, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8893725872039795, + "num_tokens": 550256052.0, + "step": 15098 + }, + { + "epoch": 2.803899721448468, + "grad_norm": 1.6443557739257812, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8796353340148926, + "num_tokens": 550293742.0, + "step": 15099 + }, + { + "epoch": 2.8040854224698237, + "grad_norm": 1.435477375984192, + "learning_rate": 1e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.8993995785713196, + "num_tokens": 550331682.0, + "step": 15100 + }, + { + "epoch": 2.804271123491179, + "grad_norm": 1.7529016733169556, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8764916658401489, + "num_tokens": 550364574.0, + "step": 15101 + }, + { + "epoch": 2.8044568245125348, + "grad_norm": 1.7180424928665161, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8825327754020691, + "num_tokens": 550404496.0, + "step": 15102 + }, + { + "epoch": 2.8046425255338905, + "grad_norm": 1.740313172340393, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8932732343673706, + "num_tokens": 550441624.0, + "step": 15103 + }, + { + "epoch": 2.804828226555246, + "grad_norm": 1.615800142288208, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8777515292167664, + "num_tokens": 550478761.0, + "step": 15104 + }, + { + "epoch": 2.8050139275766015, + "grad_norm": 1.6114153861999512, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.8992766737937927, + "num_tokens": 550514174.0, + "step": 15105 + }, + { + "epoch": 2.8051996285979572, + "grad_norm": 1.6033819913864136, + "learning_rate": 1e-06, + "loss": 0.28, + "mean_token_accuracy": 0.8978183269500732, + "num_tokens": 550548963.0, + "step": 15106 + }, + { + "epoch": 2.805385329619313, + "grad_norm": 1.736858606338501, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.882807731628418, + "num_tokens": 550581969.0, + "step": 15107 + }, + { + "epoch": 2.8055710306406683, + "grad_norm": 1.4155659675598145, + "learning_rate": 1e-06, + "loss": 0.284, + "mean_token_accuracy": 0.897893488407135, + "num_tokens": 550621077.0, + "step": 15108 + }, + { + "epoch": 2.805756731662024, + "grad_norm": 1.5546637773513794, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.892917275428772, + "num_tokens": 550658269.0, + "step": 15109 + }, + { + "epoch": 2.8059424326833797, + "grad_norm": 1.568295955657959, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8828192949295044, + "num_tokens": 550695117.0, + "step": 15110 + }, + { + "epoch": 2.8061281337047355, + "grad_norm": 1.5190644264221191, + "learning_rate": 1e-06, + "loss": 0.2733, + "mean_token_accuracy": 0.9039038419723511, + "num_tokens": 550731374.0, + "step": 15111 + }, + { + "epoch": 2.806313834726091, + "grad_norm": 1.491426706314087, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8919072151184082, + "num_tokens": 550772406.0, + "step": 15112 + }, + { + "epoch": 2.8064995357474465, + "grad_norm": 1.6315693855285645, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8899588584899902, + "num_tokens": 550807322.0, + "step": 15113 + }, + { + "epoch": 2.806685236768802, + "grad_norm": 1.5975289344787598, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.877719521522522, + "num_tokens": 550846849.0, + "step": 15114 + }, + { + "epoch": 2.806870937790158, + "grad_norm": 1.6070767641067505, + "learning_rate": 1e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.8931195139884949, + "num_tokens": 550882689.0, + "step": 15115 + }, + { + "epoch": 2.8070566388115132, + "grad_norm": 1.6493173837661743, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8838964104652405, + "num_tokens": 550918824.0, + "step": 15116 + }, + { + "epoch": 2.807242339832869, + "grad_norm": 1.554657220840454, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8946065902709961, + "num_tokens": 550956422.0, + "step": 15117 + }, + { + "epoch": 2.8074280408542247, + "grad_norm": 1.4939838647842407, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8939918875694275, + "num_tokens": 550993072.0, + "step": 15118 + }, + { + "epoch": 2.8076137418755804, + "grad_norm": 1.5324580669403076, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8953840732574463, + "num_tokens": 551030804.0, + "step": 15119 + }, + { + "epoch": 2.807799442896936, + "grad_norm": 1.356286883354187, + "learning_rate": 1e-06, + "loss": 0.2615, + "mean_token_accuracy": 0.9056781530380249, + "num_tokens": 551076572.0, + "step": 15120 + }, + { + "epoch": 2.8079851439182915, + "grad_norm": 1.614084005355835, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8923952579498291, + "num_tokens": 551112027.0, + "step": 15121 + }, + { + "epoch": 2.808170844939647, + "grad_norm": 1.656996488571167, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8834282159805298, + "num_tokens": 551149218.0, + "step": 15122 + }, + { + "epoch": 2.808356545961003, + "grad_norm": 1.7616815567016602, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8830591440200806, + "num_tokens": 551180479.0, + "step": 15123 + }, + { + "epoch": 2.808542246982358, + "grad_norm": 1.4350253343582153, + "learning_rate": 1e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.8982577323913574, + "num_tokens": 551219237.0, + "step": 15124 + }, + { + "epoch": 2.808727948003714, + "grad_norm": 1.595513939857483, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8954980373382568, + "num_tokens": 551254919.0, + "step": 15125 + }, + { + "epoch": 2.8089136490250697, + "grad_norm": 1.5570943355560303, + "learning_rate": 1e-06, + "loss": 0.2806, + "mean_token_accuracy": 0.9023809432983398, + "num_tokens": 551290607.0, + "step": 15126 + }, + { + "epoch": 2.8090993500464254, + "grad_norm": 1.615303874015808, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8864641189575195, + "num_tokens": 551325595.0, + "step": 15127 + }, + { + "epoch": 2.809285051067781, + "grad_norm": 1.5590976476669312, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8806370496749878, + "num_tokens": 551366368.0, + "step": 15128 + }, + { + "epoch": 2.8094707520891364, + "grad_norm": 1.8000408411026, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8573687076568604, + "num_tokens": 551401049.0, + "step": 15129 + }, + { + "epoch": 2.809656453110492, + "grad_norm": 1.7198550701141357, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8888801336288452, + "num_tokens": 551435338.0, + "step": 15130 + }, + { + "epoch": 2.8098421541318475, + "grad_norm": 1.5971651077270508, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8760271072387695, + "num_tokens": 551475009.0, + "step": 15131 + }, + { + "epoch": 2.810027855153203, + "grad_norm": 1.690371036529541, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8732801675796509, + "num_tokens": 551509429.0, + "step": 15132 + }, + { + "epoch": 2.810213556174559, + "grad_norm": 1.5030255317687988, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8895865678787231, + "num_tokens": 551544685.0, + "step": 15133 + }, + { + "epoch": 2.8103992571959147, + "grad_norm": 1.566635251045227, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8795514106750488, + "num_tokens": 551583650.0, + "step": 15134 + }, + { + "epoch": 2.8105849582172704, + "grad_norm": 1.5485345125198364, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8973870277404785, + "num_tokens": 551617724.0, + "step": 15135 + }, + { + "epoch": 2.8107706592386257, + "grad_norm": 1.5096195936203003, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8896067142486572, + "num_tokens": 551656165.0, + "step": 15136 + }, + { + "epoch": 2.8109563602599814, + "grad_norm": 1.5023748874664307, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.894167423248291, + "num_tokens": 551692930.0, + "step": 15137 + }, + { + "epoch": 2.811142061281337, + "grad_norm": 1.5370240211486816, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8714827299118042, + "num_tokens": 551735036.0, + "step": 15138 + }, + { + "epoch": 2.8113277623026924, + "grad_norm": 1.682799220085144, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8789384365081787, + "num_tokens": 551769635.0, + "step": 15139 + }, + { + "epoch": 2.811513463324048, + "grad_norm": 1.4974448680877686, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8953894972801208, + "num_tokens": 551810682.0, + "step": 15140 + }, + { + "epoch": 2.811699164345404, + "grad_norm": 1.5523604154586792, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8956592082977295, + "num_tokens": 551847777.0, + "step": 15141 + }, + { + "epoch": 2.8118848653667596, + "grad_norm": 1.407874584197998, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8984110355377197, + "num_tokens": 551890449.0, + "step": 15142 + }, + { + "epoch": 2.8120705663881154, + "grad_norm": 1.6672481298446655, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8825398683547974, + "num_tokens": 551923594.0, + "step": 15143 + }, + { + "epoch": 2.8122562674094707, + "grad_norm": 1.5682713985443115, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8856919407844543, + "num_tokens": 551962676.0, + "step": 15144 + }, + { + "epoch": 2.8124419684308264, + "grad_norm": 1.6636910438537598, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.889485776424408, + "num_tokens": 551997956.0, + "step": 15145 + }, + { + "epoch": 2.812627669452182, + "grad_norm": 1.5835798978805542, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8911676406860352, + "num_tokens": 552033749.0, + "step": 15146 + }, + { + "epoch": 2.8128133704735374, + "grad_norm": 2.0763230323791504, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8841061592102051, + "num_tokens": 552074326.0, + "step": 15147 + }, + { + "epoch": 2.812999071494893, + "grad_norm": 1.6039044857025146, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8781696557998657, + "num_tokens": 552109896.0, + "step": 15148 + }, + { + "epoch": 2.813184772516249, + "grad_norm": 1.552625298500061, + "learning_rate": 1e-06, + "loss": 0.2785, + "mean_token_accuracy": 0.898029625415802, + "num_tokens": 552146938.0, + "step": 15149 + }, + { + "epoch": 2.8133704735376046, + "grad_norm": 1.8009918928146362, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8965055346488953, + "num_tokens": 552178074.0, + "step": 15150 + }, + { + "epoch": 2.8135561745589603, + "grad_norm": 1.6019984483718872, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8886389136314392, + "num_tokens": 552217502.0, + "step": 15151 + }, + { + "epoch": 2.8137418755803156, + "grad_norm": 1.623167634010315, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8974034786224365, + "num_tokens": 552254894.0, + "step": 15152 + }, + { + "epoch": 2.8139275766016714, + "grad_norm": 1.6988554000854492, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8852011561393738, + "num_tokens": 552289359.0, + "step": 15153 + }, + { + "epoch": 2.8141132776230267, + "grad_norm": 1.5071436166763306, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8936105966567993, + "num_tokens": 552328483.0, + "step": 15154 + }, + { + "epoch": 2.8142989786443824, + "grad_norm": 1.7357064485549927, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8832398653030396, + "num_tokens": 552358971.0, + "step": 15155 + }, + { + "epoch": 2.814484679665738, + "grad_norm": 1.687735676765442, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8863369226455688, + "num_tokens": 552395547.0, + "step": 15156 + }, + { + "epoch": 2.814670380687094, + "grad_norm": 1.6010957956314087, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8948434591293335, + "num_tokens": 552430917.0, + "step": 15157 + }, + { + "epoch": 2.8148560817084496, + "grad_norm": 1.6356974840164185, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8926464915275574, + "num_tokens": 552462122.0, + "step": 15158 + }, + { + "epoch": 2.815041782729805, + "grad_norm": 1.4952374696731567, + "learning_rate": 1e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.8976655006408691, + "num_tokens": 552498017.0, + "step": 15159 + }, + { + "epoch": 2.8152274837511606, + "grad_norm": 1.6397126913070679, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8754626512527466, + "num_tokens": 552533965.0, + "step": 15160 + }, + { + "epoch": 2.8154131847725163, + "grad_norm": 1.53730309009552, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8893795609474182, + "num_tokens": 552572144.0, + "step": 15161 + }, + { + "epoch": 2.8155988857938716, + "grad_norm": 1.5328762531280518, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.871182918548584, + "num_tokens": 552615582.0, + "step": 15162 + }, + { + "epoch": 2.8157845868152274, + "grad_norm": 1.5006150007247925, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8939836025238037, + "num_tokens": 552658110.0, + "step": 15163 + }, + { + "epoch": 2.815970287836583, + "grad_norm": 1.874480128288269, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8769663572311401, + "num_tokens": 552690869.0, + "step": 15164 + }, + { + "epoch": 2.816155988857939, + "grad_norm": 1.5260077714920044, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8813791275024414, + "num_tokens": 552732286.0, + "step": 15165 + }, + { + "epoch": 2.8163416898792946, + "grad_norm": 1.6270127296447754, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8869601488113403, + "num_tokens": 552765003.0, + "step": 15166 + }, + { + "epoch": 2.81652739090065, + "grad_norm": 1.6185115575790405, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.89351886510849, + "num_tokens": 552798307.0, + "step": 15167 + }, + { + "epoch": 2.8167130919220056, + "grad_norm": 1.6315462589263916, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8811253905296326, + "num_tokens": 552834843.0, + "step": 15168 + }, + { + "epoch": 2.8168987929433613, + "grad_norm": 1.6140873432159424, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8619420528411865, + "num_tokens": 552878962.0, + "step": 15169 + }, + { + "epoch": 2.8170844939647166, + "grad_norm": 1.6824675798416138, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8793129920959473, + "num_tokens": 552915384.0, + "step": 15170 + }, + { + "epoch": 2.8172701949860723, + "grad_norm": 1.5920692682266235, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8837652802467346, + "num_tokens": 552955171.0, + "step": 15171 + }, + { + "epoch": 2.817455896007428, + "grad_norm": 1.7983845472335815, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8839733600616455, + "num_tokens": 552987345.0, + "step": 15172 + }, + { + "epoch": 2.817641597028784, + "grad_norm": 1.704630970954895, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8681369423866272, + "num_tokens": 553022483.0, + "step": 15173 + }, + { + "epoch": 2.8178272980501395, + "grad_norm": 1.3901106119155884, + "learning_rate": 1e-06, + "loss": 0.2593, + "mean_token_accuracy": 0.9034038186073303, + "num_tokens": 553064046.0, + "step": 15174 + }, + { + "epoch": 2.818012999071495, + "grad_norm": 1.5739563703536987, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8840563297271729, + "num_tokens": 553101500.0, + "step": 15175 + }, + { + "epoch": 2.8181987000928506, + "grad_norm": 1.5719804763793945, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.875917911529541, + "num_tokens": 553143908.0, + "step": 15176 + }, + { + "epoch": 2.8183844011142063, + "grad_norm": 1.6678214073181152, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8846665620803833, + "num_tokens": 553182710.0, + "step": 15177 + }, + { + "epoch": 2.8185701021355616, + "grad_norm": 1.7429155111312866, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8824334144592285, + "num_tokens": 553216080.0, + "step": 15178 + }, + { + "epoch": 2.8187558031569173, + "grad_norm": 1.6448407173156738, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8857475519180298, + "num_tokens": 553253650.0, + "step": 15179 + }, + { + "epoch": 2.818941504178273, + "grad_norm": 1.6312299966812134, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8778978586196899, + "num_tokens": 553288599.0, + "step": 15180 + }, + { + "epoch": 2.819127205199629, + "grad_norm": 1.5399378538131714, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.880929708480835, + "num_tokens": 553329330.0, + "step": 15181 + }, + { + "epoch": 2.819312906220984, + "grad_norm": 1.5875060558319092, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8800407648086548, + "num_tokens": 553370137.0, + "step": 15182 + }, + { + "epoch": 2.81949860724234, + "grad_norm": 1.5622514486312866, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.881409764289856, + "num_tokens": 553409369.0, + "step": 15183 + }, + { + "epoch": 2.8196843082636955, + "grad_norm": 1.8775062561035156, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8790923357009888, + "num_tokens": 553439241.0, + "step": 15184 + }, + { + "epoch": 2.819870009285051, + "grad_norm": 1.532671332359314, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8849638104438782, + "num_tokens": 553480978.0, + "step": 15185 + }, + { + "epoch": 2.8200557103064066, + "grad_norm": 1.6396085023880005, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8805328607559204, + "num_tokens": 553519655.0, + "step": 15186 + }, + { + "epoch": 2.8202414113277623, + "grad_norm": 1.524222493171692, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8885498642921448, + "num_tokens": 553559947.0, + "step": 15187 + }, + { + "epoch": 2.820427112349118, + "grad_norm": 1.6532039642333984, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.894465982913971, + "num_tokens": 553594653.0, + "step": 15188 + }, + { + "epoch": 2.8206128133704738, + "grad_norm": 1.537258505821228, + "learning_rate": 1e-06, + "loss": 0.2755, + "mean_token_accuracy": 0.8988964557647705, + "num_tokens": 553627368.0, + "step": 15189 + }, + { + "epoch": 2.820798514391829, + "grad_norm": 1.5245277881622314, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8895466327667236, + "num_tokens": 553667743.0, + "step": 15190 + }, + { + "epoch": 2.820984215413185, + "grad_norm": 1.6191726922988892, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8668352365493774, + "num_tokens": 553706626.0, + "step": 15191 + }, + { + "epoch": 2.8211699164345405, + "grad_norm": 1.6408982276916504, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8751848936080933, + "num_tokens": 553744875.0, + "step": 15192 + }, + { + "epoch": 2.821355617455896, + "grad_norm": 1.6030031442642212, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8814601898193359, + "num_tokens": 553782380.0, + "step": 15193 + }, + { + "epoch": 2.8215413184772515, + "grad_norm": 1.526822805404663, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8942430019378662, + "num_tokens": 553820450.0, + "step": 15194 + }, + { + "epoch": 2.8217270194986073, + "grad_norm": 1.8014434576034546, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8790098428726196, + "num_tokens": 553851682.0, + "step": 15195 + }, + { + "epoch": 2.821912720519963, + "grad_norm": 1.5696860551834106, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8992248773574829, + "num_tokens": 553886320.0, + "step": 15196 + }, + { + "epoch": 2.8220984215413187, + "grad_norm": 1.61921226978302, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8925841450691223, + "num_tokens": 553922169.0, + "step": 15197 + }, + { + "epoch": 2.822284122562674, + "grad_norm": 1.701371669769287, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8805199861526489, + "num_tokens": 553959214.0, + "step": 15198 + }, + { + "epoch": 2.8224698235840298, + "grad_norm": 1.8814692497253418, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8758634924888611, + "num_tokens": 553990383.0, + "step": 15199 + }, + { + "epoch": 2.8226555246053855, + "grad_norm": 1.6033978462219238, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8796212673187256, + "num_tokens": 554031883.0, + "step": 15200 + }, + { + "epoch": 2.822841225626741, + "grad_norm": 1.6751662492752075, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8855485916137695, + "num_tokens": 554065476.0, + "step": 15201 + }, + { + "epoch": 2.8230269266480965, + "grad_norm": 1.7509567737579346, + "learning_rate": 1e-06, + "loss": 0.2808, + "mean_token_accuracy": 0.896511435508728, + "num_tokens": 554093246.0, + "step": 15202 + }, + { + "epoch": 2.8232126276694522, + "grad_norm": 1.6747848987579346, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8830066919326782, + "num_tokens": 554128674.0, + "step": 15203 + }, + { + "epoch": 2.823398328690808, + "grad_norm": 1.5941022634506226, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8941258192062378, + "num_tokens": 554165731.0, + "step": 15204 + }, + { + "epoch": 2.8235840297121633, + "grad_norm": 1.6907868385314941, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.882225751876831, + "num_tokens": 554198509.0, + "step": 15205 + }, + { + "epoch": 2.823769730733519, + "grad_norm": 1.6211977005004883, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.889897346496582, + "num_tokens": 554232137.0, + "step": 15206 + }, + { + "epoch": 2.8239554317548747, + "grad_norm": 1.6410748958587646, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.886116087436676, + "num_tokens": 554269246.0, + "step": 15207 + }, + { + "epoch": 2.82414113277623, + "grad_norm": 1.6885353326797485, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8834905028343201, + "num_tokens": 554300759.0, + "step": 15208 + }, + { + "epoch": 2.8243268337975858, + "grad_norm": 1.6155213117599487, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8809047937393188, + "num_tokens": 554339752.0, + "step": 15209 + }, + { + "epoch": 2.8245125348189415, + "grad_norm": 1.6881436109542847, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8890076875686646, + "num_tokens": 554374782.0, + "step": 15210 + }, + { + "epoch": 2.8246982358402972, + "grad_norm": 1.7046953439712524, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.8955563306808472, + "num_tokens": 554406939.0, + "step": 15211 + }, + { + "epoch": 2.824883936861653, + "grad_norm": 1.603710651397705, + "learning_rate": 1e-06, + "loss": 0.2798, + "mean_token_accuracy": 0.8968074321746826, + "num_tokens": 554439258.0, + "step": 15212 + }, + { + "epoch": 2.8250696378830082, + "grad_norm": 1.7444570064544678, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8789582848548889, + "num_tokens": 554475054.0, + "step": 15213 + }, + { + "epoch": 2.825255338904364, + "grad_norm": 1.694878101348877, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8865118026733398, + "num_tokens": 554509055.0, + "step": 15214 + }, + { + "epoch": 2.8254410399257197, + "grad_norm": 1.7516381740570068, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8842421174049377, + "num_tokens": 554542583.0, + "step": 15215 + }, + { + "epoch": 2.825626740947075, + "grad_norm": 1.6609110832214355, + "learning_rate": 1e-06, + "loss": 0.2635, + "mean_token_accuracy": 0.907021164894104, + "num_tokens": 554573146.0, + "step": 15216 + }, + { + "epoch": 2.8258124419684307, + "grad_norm": 1.5695849657058716, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8899492621421814, + "num_tokens": 554611125.0, + "step": 15217 + }, + { + "epoch": 2.8259981429897865, + "grad_norm": 1.5637630224227905, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8958749771118164, + "num_tokens": 554647436.0, + "step": 15218 + }, + { + "epoch": 2.826183844011142, + "grad_norm": 1.6597154140472412, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.86221843957901, + "num_tokens": 554687345.0, + "step": 15219 + }, + { + "epoch": 2.826369545032498, + "grad_norm": 1.8412842750549316, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8845388293266296, + "num_tokens": 554720428.0, + "step": 15220 + }, + { + "epoch": 2.826555246053853, + "grad_norm": 1.7200735807418823, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8741567730903625, + "num_tokens": 554755524.0, + "step": 15221 + }, + { + "epoch": 2.826740947075209, + "grad_norm": 1.7134698629379272, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8955384492874146, + "num_tokens": 554787033.0, + "step": 15222 + }, + { + "epoch": 2.8269266480965647, + "grad_norm": 1.743377685546875, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8717645406723022, + "num_tokens": 554819928.0, + "step": 15223 + }, + { + "epoch": 2.82711234911792, + "grad_norm": 1.6339842081069946, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8658027648925781, + "num_tokens": 554861182.0, + "step": 15224 + }, + { + "epoch": 2.8272980501392757, + "grad_norm": 1.66571044921875, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8860363960266113, + "num_tokens": 554894690.0, + "step": 15225 + }, + { + "epoch": 2.8274837511606314, + "grad_norm": 1.6313456296920776, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8952162265777588, + "num_tokens": 554931567.0, + "step": 15226 + }, + { + "epoch": 2.827669452181987, + "grad_norm": 1.6470428705215454, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8863965272903442, + "num_tokens": 554972416.0, + "step": 15227 + }, + { + "epoch": 2.8278551532033425, + "grad_norm": 1.5546317100524902, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8877150416374207, + "num_tokens": 555014838.0, + "step": 15228 + }, + { + "epoch": 2.828040854224698, + "grad_norm": 1.5514994859695435, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8880363702774048, + "num_tokens": 555055965.0, + "step": 15229 + }, + { + "epoch": 2.828226555246054, + "grad_norm": 1.7702544927597046, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8817011117935181, + "num_tokens": 555085461.0, + "step": 15230 + }, + { + "epoch": 2.828412256267409, + "grad_norm": 1.722206473350525, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.875499963760376, + "num_tokens": 555121985.0, + "step": 15231 + }, + { + "epoch": 2.828597957288765, + "grad_norm": 1.7220052480697632, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8798595070838928, + "num_tokens": 555157156.0, + "step": 15232 + }, + { + "epoch": 2.8287836583101207, + "grad_norm": 1.7014447450637817, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8889573812484741, + "num_tokens": 555192425.0, + "step": 15233 + }, + { + "epoch": 2.8289693593314764, + "grad_norm": 1.8085650205612183, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.881899356842041, + "num_tokens": 555221824.0, + "step": 15234 + }, + { + "epoch": 2.829155060352832, + "grad_norm": 1.687044382095337, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8870471119880676, + "num_tokens": 555254134.0, + "step": 15235 + }, + { + "epoch": 2.8293407613741874, + "grad_norm": 1.708611249923706, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8852261304855347, + "num_tokens": 555291233.0, + "step": 15236 + }, + { + "epoch": 2.829526462395543, + "grad_norm": 1.564077377319336, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8903817534446716, + "num_tokens": 555333040.0, + "step": 15237 + }, + { + "epoch": 2.829712163416899, + "grad_norm": 1.6300796270370483, + "learning_rate": 1e-06, + "loss": 0.283, + "mean_token_accuracy": 0.8977497220039368, + "num_tokens": 555365722.0, + "step": 15238 + }, + { + "epoch": 2.829897864438254, + "grad_norm": 1.6299182176589966, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8821218013763428, + "num_tokens": 555401712.0, + "step": 15239 + }, + { + "epoch": 2.83008356545961, + "grad_norm": 1.5979045629501343, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8857231140136719, + "num_tokens": 555441236.0, + "step": 15240 + }, + { + "epoch": 2.8302692664809657, + "grad_norm": 1.5819247961044312, + "learning_rate": 1e-06, + "loss": 0.2758, + "mean_token_accuracy": 0.8994075059890747, + "num_tokens": 555480286.0, + "step": 15241 + }, + { + "epoch": 2.8304549675023214, + "grad_norm": 1.7285466194152832, + "learning_rate": 1e-06, + "loss": 0.2847, + "mean_token_accuracy": 0.8964896202087402, + "num_tokens": 555512830.0, + "step": 15242 + }, + { + "epoch": 2.830640668523677, + "grad_norm": 1.5791714191436768, + "learning_rate": 1e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.8969990015029907, + "num_tokens": 555549766.0, + "step": 15243 + }, + { + "epoch": 2.8308263695450324, + "grad_norm": 1.6001570224761963, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.8935006856918335, + "num_tokens": 555588611.0, + "step": 15244 + }, + { + "epoch": 2.831012070566388, + "grad_norm": 1.4647654294967651, + "learning_rate": 1e-06, + "loss": 0.2756, + "mean_token_accuracy": 0.900036096572876, + "num_tokens": 555627449.0, + "step": 15245 + }, + { + "epoch": 2.831197771587744, + "grad_norm": 1.6115206480026245, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8805402517318726, + "num_tokens": 555666730.0, + "step": 15246 + }, + { + "epoch": 2.831383472609099, + "grad_norm": 1.6608123779296875, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8742964863777161, + "num_tokens": 555704430.0, + "step": 15247 + }, + { + "epoch": 2.831569173630455, + "grad_norm": 1.5810340642929077, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8734438419342041, + "num_tokens": 555746861.0, + "step": 15248 + }, + { + "epoch": 2.8317548746518106, + "grad_norm": 1.5795252323150635, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8824909925460815, + "num_tokens": 555784600.0, + "step": 15249 + }, + { + "epoch": 2.8319405756731664, + "grad_norm": 1.5944359302520752, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8855398297309875, + "num_tokens": 555822408.0, + "step": 15250 + }, + { + "epoch": 2.8321262766945217, + "grad_norm": 1.6865732669830322, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8817090392112732, + "num_tokens": 555855428.0, + "step": 15251 + }, + { + "epoch": 2.8323119777158774, + "grad_norm": 1.626940131187439, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8803980350494385, + "num_tokens": 555890911.0, + "step": 15252 + }, + { + "epoch": 2.832497678737233, + "grad_norm": 1.8226834535598755, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8648777008056641, + "num_tokens": 555924920.0, + "step": 15253 + }, + { + "epoch": 2.8326833797585884, + "grad_norm": 1.8129488229751587, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8871926665306091, + "num_tokens": 555956946.0, + "step": 15254 + }, + { + "epoch": 2.832869080779944, + "grad_norm": 1.827681064605713, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.876650869846344, + "num_tokens": 555986989.0, + "step": 15255 + }, + { + "epoch": 2.8330547818013, + "grad_norm": 1.5318901538848877, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8742870092391968, + "num_tokens": 556029514.0, + "step": 15256 + }, + { + "epoch": 2.8332404828226556, + "grad_norm": 1.6972318887710571, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8830520510673523, + "num_tokens": 556061317.0, + "step": 15257 + }, + { + "epoch": 2.8334261838440113, + "grad_norm": 1.6073068380355835, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8839219808578491, + "num_tokens": 556099119.0, + "step": 15258 + }, + { + "epoch": 2.8336118848653666, + "grad_norm": 1.5313597917556763, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8888880014419556, + "num_tokens": 556137568.0, + "step": 15259 + }, + { + "epoch": 2.8337975858867224, + "grad_norm": 1.5288982391357422, + "learning_rate": 1e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.895445704460144, + "num_tokens": 556172029.0, + "step": 15260 + }, + { + "epoch": 2.833983286908078, + "grad_norm": 1.5778027772903442, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8916946649551392, + "num_tokens": 556206179.0, + "step": 15261 + }, + { + "epoch": 2.8341689879294334, + "grad_norm": 1.4982154369354248, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8886048793792725, + "num_tokens": 556247950.0, + "step": 15262 + }, + { + "epoch": 2.834354688950789, + "grad_norm": 1.6573206186294556, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8945354223251343, + "num_tokens": 556282737.0, + "step": 15263 + }, + { + "epoch": 2.834540389972145, + "grad_norm": 1.6366380453109741, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.894472599029541, + "num_tokens": 556316632.0, + "step": 15264 + }, + { + "epoch": 2.8347260909935006, + "grad_norm": 1.5741938352584839, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8789173364639282, + "num_tokens": 556354839.0, + "step": 15265 + }, + { + "epoch": 2.8349117920148563, + "grad_norm": 1.6555603742599487, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8835754990577698, + "num_tokens": 556389093.0, + "step": 15266 + }, + { + "epoch": 2.8350974930362116, + "grad_norm": 1.572615146636963, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8811564445495605, + "num_tokens": 556429744.0, + "step": 15267 + }, + { + "epoch": 2.8352831940575673, + "grad_norm": 1.7653478384017944, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8868557214736938, + "num_tokens": 556462198.0, + "step": 15268 + }, + { + "epoch": 2.835468895078923, + "grad_norm": 1.5947530269622803, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8827871680259705, + "num_tokens": 556499528.0, + "step": 15269 + }, + { + "epoch": 2.8356545961002784, + "grad_norm": 1.6081104278564453, + "learning_rate": 1e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.8957651257514954, + "num_tokens": 556534904.0, + "step": 15270 + }, + { + "epoch": 2.835840297121634, + "grad_norm": 1.6344125270843506, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8887454271316528, + "num_tokens": 556569648.0, + "step": 15271 + }, + { + "epoch": 2.83602599814299, + "grad_norm": 1.4764262437820435, + "learning_rate": 1e-06, + "loss": 0.2814, + "mean_token_accuracy": 0.9001122117042542, + "num_tokens": 556610163.0, + "step": 15272 + }, + { + "epoch": 2.8362116991643456, + "grad_norm": 1.5528852939605713, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8851832151412964, + "num_tokens": 556651082.0, + "step": 15273 + }, + { + "epoch": 2.836397400185701, + "grad_norm": 1.5806008577346802, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8856365084648132, + "num_tokens": 556689221.0, + "step": 15274 + }, + { + "epoch": 2.8365831012070566, + "grad_norm": 1.8003567457199097, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8758180141448975, + "num_tokens": 556721194.0, + "step": 15275 + }, + { + "epoch": 2.8367688022284123, + "grad_norm": 1.6941381692886353, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8841105699539185, + "num_tokens": 556753576.0, + "step": 15276 + }, + { + "epoch": 2.8369545032497676, + "grad_norm": 1.5935485363006592, + "learning_rate": 1e-06, + "loss": 0.275, + "mean_token_accuracy": 0.9036417007446289, + "num_tokens": 556790412.0, + "step": 15277 + }, + { + "epoch": 2.8371402042711233, + "grad_norm": 1.6420629024505615, + "learning_rate": 1e-06, + "loss": 0.2804, + "mean_token_accuracy": 0.901594877243042, + "num_tokens": 556822680.0, + "step": 15278 + }, + { + "epoch": 2.837325905292479, + "grad_norm": 1.6519097089767456, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8950565457344055, + "num_tokens": 556856879.0, + "step": 15279 + }, + { + "epoch": 2.837511606313835, + "grad_norm": 1.4936999082565308, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8852709531784058, + "num_tokens": 556892188.0, + "step": 15280 + }, + { + "epoch": 2.8376973073351905, + "grad_norm": 1.4525576829910278, + "learning_rate": 1e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8905597925186157, + "num_tokens": 556933462.0, + "step": 15281 + }, + { + "epoch": 2.837883008356546, + "grad_norm": 1.6524101495742798, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8878655433654785, + "num_tokens": 556969420.0, + "step": 15282 + }, + { + "epoch": 2.8380687093779016, + "grad_norm": 1.6994848251342773, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8777744770050049, + "num_tokens": 557004361.0, + "step": 15283 + }, + { + "epoch": 2.8382544103992573, + "grad_norm": 1.6098856925964355, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8887287378311157, + "num_tokens": 557039575.0, + "step": 15284 + }, + { + "epoch": 2.8384401114206126, + "grad_norm": 1.6330753564834595, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8843973875045776, + "num_tokens": 557074230.0, + "step": 15285 + }, + { + "epoch": 2.8386258124419683, + "grad_norm": 1.5991730690002441, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8869717121124268, + "num_tokens": 557111022.0, + "step": 15286 + }, + { + "epoch": 2.838811513463324, + "grad_norm": 1.5087977647781372, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8885964155197144, + "num_tokens": 557150457.0, + "step": 15287 + }, + { + "epoch": 2.83899721448468, + "grad_norm": 1.7879743576049805, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8816133737564087, + "num_tokens": 557184760.0, + "step": 15288 + }, + { + "epoch": 2.8391829155060355, + "grad_norm": 1.6544803380966187, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8874707818031311, + "num_tokens": 557221037.0, + "step": 15289 + }, + { + "epoch": 2.839368616527391, + "grad_norm": 1.5983392000198364, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8934001922607422, + "num_tokens": 557254650.0, + "step": 15290 + }, + { + "epoch": 2.8395543175487465, + "grad_norm": 1.6663318872451782, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8746604919433594, + "num_tokens": 557293789.0, + "step": 15291 + }, + { + "epoch": 2.8397400185701023, + "grad_norm": 1.710822343826294, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8745534420013428, + "num_tokens": 557328369.0, + "step": 15292 + }, + { + "epoch": 2.8399257195914576, + "grad_norm": 1.5667513608932495, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8853736519813538, + "num_tokens": 557364976.0, + "step": 15293 + }, + { + "epoch": 2.8401114206128133, + "grad_norm": 1.6000804901123047, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8890043497085571, + "num_tokens": 557401542.0, + "step": 15294 + }, + { + "epoch": 2.840297121634169, + "grad_norm": 1.5774574279785156, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.8941330313682556, + "num_tokens": 557436757.0, + "step": 15295 + }, + { + "epoch": 2.8404828226555248, + "grad_norm": 1.5399912595748901, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8841040134429932, + "num_tokens": 557476393.0, + "step": 15296 + }, + { + "epoch": 2.8406685236768805, + "grad_norm": 1.497054100036621, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8918584585189819, + "num_tokens": 557514306.0, + "step": 15297 + }, + { + "epoch": 2.840854224698236, + "grad_norm": 1.600064754486084, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8771442770957947, + "num_tokens": 557553951.0, + "step": 15298 + }, + { + "epoch": 2.8410399257195915, + "grad_norm": 1.7629026174545288, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8892310857772827, + "num_tokens": 557585899.0, + "step": 15299 + }, + { + "epoch": 2.841225626740947, + "grad_norm": 1.5374358892440796, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8754054307937622, + "num_tokens": 557625199.0, + "step": 15300 + }, + { + "epoch": 2.8414113277623025, + "grad_norm": 1.7541433572769165, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8789830207824707, + "num_tokens": 557662507.0, + "step": 15301 + }, + { + "epoch": 2.8415970287836583, + "grad_norm": 1.4665653705596924, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8921681642532349, + "num_tokens": 557705554.0, + "step": 15302 + }, + { + "epoch": 2.841782729805014, + "grad_norm": 1.538225769996643, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8921560049057007, + "num_tokens": 557745559.0, + "step": 15303 + }, + { + "epoch": 2.8419684308263697, + "grad_norm": 1.4555848836898804, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8881803750991821, + "num_tokens": 557786207.0, + "step": 15304 + }, + { + "epoch": 2.842154131847725, + "grad_norm": 1.5620567798614502, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8894391655921936, + "num_tokens": 557825067.0, + "step": 15305 + }, + { + "epoch": 2.8423398328690808, + "grad_norm": 1.6672117710113525, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8791525363922119, + "num_tokens": 557865498.0, + "step": 15306 + }, + { + "epoch": 2.8425255338904365, + "grad_norm": 1.571724534034729, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8940100073814392, + "num_tokens": 557903031.0, + "step": 15307 + }, + { + "epoch": 2.842711234911792, + "grad_norm": 1.6485724449157715, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8899164199829102, + "num_tokens": 557935803.0, + "step": 15308 + }, + { + "epoch": 2.8428969359331475, + "grad_norm": 1.4850056171417236, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.8951210379600525, + "num_tokens": 557973989.0, + "step": 15309 + }, + { + "epoch": 2.8430826369545033, + "grad_norm": 1.5514702796936035, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8866215348243713, + "num_tokens": 558015092.0, + "step": 15310 + }, + { + "epoch": 2.843268337975859, + "grad_norm": 1.5454204082489014, + "learning_rate": 1e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.8945673704147339, + "num_tokens": 558050993.0, + "step": 15311 + }, + { + "epoch": 2.8434540389972147, + "grad_norm": 1.569239616394043, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8908935189247131, + "num_tokens": 558086098.0, + "step": 15312 + }, + { + "epoch": 2.84363974001857, + "grad_norm": 1.673568844795227, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8956981897354126, + "num_tokens": 558121989.0, + "step": 15313 + }, + { + "epoch": 2.8438254410399257, + "grad_norm": 1.5768705606460571, + "learning_rate": 1e-06, + "loss": 0.2702, + "mean_token_accuracy": 0.9000258445739746, + "num_tokens": 558155303.0, + "step": 15314 + }, + { + "epoch": 2.8440111420612815, + "grad_norm": 1.563961386680603, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8839868903160095, + "num_tokens": 558196174.0, + "step": 15315 + }, + { + "epoch": 2.8441968430826368, + "grad_norm": 1.7328600883483887, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8812814354896545, + "num_tokens": 558233622.0, + "step": 15316 + }, + { + "epoch": 2.8443825441039925, + "grad_norm": 1.6530894041061401, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8962996602058411, + "num_tokens": 558267185.0, + "step": 15317 + }, + { + "epoch": 2.8445682451253482, + "grad_norm": 1.575513243675232, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8732572793960571, + "num_tokens": 558307603.0, + "step": 15318 + }, + { + "epoch": 2.844753946146704, + "grad_norm": 1.6160649061203003, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8811608552932739, + "num_tokens": 558345965.0, + "step": 15319 + }, + { + "epoch": 2.8449396471680597, + "grad_norm": 1.574609637260437, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8853356242179871, + "num_tokens": 558385581.0, + "step": 15320 + }, + { + "epoch": 2.845125348189415, + "grad_norm": 1.509626030921936, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8958925008773804, + "num_tokens": 558423125.0, + "step": 15321 + }, + { + "epoch": 2.8453110492107707, + "grad_norm": 1.7368478775024414, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8744152188301086, + "num_tokens": 558456862.0, + "step": 15322 + }, + { + "epoch": 2.845496750232126, + "grad_norm": 1.6945241689682007, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8928505182266235, + "num_tokens": 558489639.0, + "step": 15323 + }, + { + "epoch": 2.8456824512534817, + "grad_norm": 1.4599896669387817, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8917317986488342, + "num_tokens": 558531284.0, + "step": 15324 + }, + { + "epoch": 2.8458681522748375, + "grad_norm": 1.4763458967208862, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8774510622024536, + "num_tokens": 558577062.0, + "step": 15325 + }, + { + "epoch": 2.846053853296193, + "grad_norm": 1.5274443626403809, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8892611265182495, + "num_tokens": 558614646.0, + "step": 15326 + }, + { + "epoch": 2.846239554317549, + "grad_norm": 1.5790998935699463, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8903480768203735, + "num_tokens": 558647738.0, + "step": 15327 + }, + { + "epoch": 2.8464252553389042, + "grad_norm": 1.6294372081756592, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.890849232673645, + "num_tokens": 558681451.0, + "step": 15328 + }, + { + "epoch": 2.84661095636026, + "grad_norm": 1.609506368637085, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8950794339179993, + "num_tokens": 558716927.0, + "step": 15329 + }, + { + "epoch": 2.8467966573816157, + "grad_norm": 1.5014307498931885, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8893933296203613, + "num_tokens": 558755652.0, + "step": 15330 + }, + { + "epoch": 2.846982358402971, + "grad_norm": 1.7322332859039307, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.884395956993103, + "num_tokens": 558788307.0, + "step": 15331 + }, + { + "epoch": 2.8471680594243267, + "grad_norm": 1.4812946319580078, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8833805322647095, + "num_tokens": 558832089.0, + "step": 15332 + }, + { + "epoch": 2.8473537604456824, + "grad_norm": 1.739137053489685, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8772035837173462, + "num_tokens": 558866177.0, + "step": 15333 + }, + { + "epoch": 2.847539461467038, + "grad_norm": 1.71392822265625, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8843463659286499, + "num_tokens": 558900812.0, + "step": 15334 + }, + { + "epoch": 2.847725162488394, + "grad_norm": 1.5761523246765137, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8821899890899658, + "num_tokens": 558937174.0, + "step": 15335 + }, + { + "epoch": 2.847910863509749, + "grad_norm": 1.5671873092651367, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8665821552276611, + "num_tokens": 558976772.0, + "step": 15336 + }, + { + "epoch": 2.848096564531105, + "grad_norm": 1.595810055732727, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8896095752716064, + "num_tokens": 559009013.0, + "step": 15337 + }, + { + "epoch": 2.8482822655524607, + "grad_norm": 1.4383370876312256, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8946284055709839, + "num_tokens": 559049164.0, + "step": 15338 + }, + { + "epoch": 2.848467966573816, + "grad_norm": 1.5896867513656616, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8874495029449463, + "num_tokens": 559084911.0, + "step": 15339 + }, + { + "epoch": 2.8486536675951717, + "grad_norm": 1.557114839553833, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8837271928787231, + "num_tokens": 559126891.0, + "step": 15340 + }, + { + "epoch": 2.8488393686165274, + "grad_norm": 1.4553487300872803, + "learning_rate": 1e-06, + "loss": 0.2661, + "mean_token_accuracy": 0.9001076221466064, + "num_tokens": 559167296.0, + "step": 15341 + }, + { + "epoch": 2.849025069637883, + "grad_norm": 1.7999584674835205, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8665157556533813, + "num_tokens": 559202675.0, + "step": 15342 + }, + { + "epoch": 2.849210770659239, + "grad_norm": 1.585157871246338, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.88997483253479, + "num_tokens": 559239078.0, + "step": 15343 + }, + { + "epoch": 2.849396471680594, + "grad_norm": 1.4989787340164185, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8815851211547852, + "num_tokens": 559281007.0, + "step": 15344 + }, + { + "epoch": 2.84958217270195, + "grad_norm": 1.4781017303466797, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8824283480644226, + "num_tokens": 559322992.0, + "step": 15345 + }, + { + "epoch": 2.8497678737233056, + "grad_norm": 1.5150099992752075, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8877177238464355, + "num_tokens": 559363125.0, + "step": 15346 + }, + { + "epoch": 2.849953574744661, + "grad_norm": 1.6342358589172363, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8921236991882324, + "num_tokens": 559399245.0, + "step": 15347 + }, + { + "epoch": 2.8501392757660167, + "grad_norm": 1.8579061031341553, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8798247575759888, + "num_tokens": 559429146.0, + "step": 15348 + }, + { + "epoch": 2.8503249767873724, + "grad_norm": 1.6872389316558838, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8930776715278625, + "num_tokens": 559461116.0, + "step": 15349 + }, + { + "epoch": 2.850510677808728, + "grad_norm": 1.742245078086853, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8875225782394409, + "num_tokens": 559494802.0, + "step": 15350 + }, + { + "epoch": 2.8506963788300834, + "grad_norm": 1.6821926832199097, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8881303071975708, + "num_tokens": 559530576.0, + "step": 15351 + }, + { + "epoch": 2.850882079851439, + "grad_norm": 1.7485591173171997, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.886894702911377, + "num_tokens": 559562292.0, + "step": 15352 + }, + { + "epoch": 2.851067780872795, + "grad_norm": 1.6595510244369507, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8740516304969788, + "num_tokens": 559599356.0, + "step": 15353 + }, + { + "epoch": 2.85125348189415, + "grad_norm": 1.5239973068237305, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8836673498153687, + "num_tokens": 559638330.0, + "step": 15354 + }, + { + "epoch": 2.851439182915506, + "grad_norm": 1.7314026355743408, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8897876739501953, + "num_tokens": 559668934.0, + "step": 15355 + }, + { + "epoch": 2.8516248839368616, + "grad_norm": 1.8842178583145142, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8768712282180786, + "num_tokens": 559697897.0, + "step": 15356 + }, + { + "epoch": 2.8518105849582174, + "grad_norm": 1.654021978378296, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8780763149261475, + "num_tokens": 559737018.0, + "step": 15357 + }, + { + "epoch": 2.851996285979573, + "grad_norm": 1.7681612968444824, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8762859106063843, + "num_tokens": 559772635.0, + "step": 15358 + }, + { + "epoch": 2.8521819870009284, + "grad_norm": 1.4829598665237427, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8920071721076965, + "num_tokens": 559814474.0, + "step": 15359 + }, + { + "epoch": 2.852367688022284, + "grad_norm": 1.5597355365753174, + "learning_rate": 1e-06, + "loss": 0.2702, + "mean_token_accuracy": 0.898659348487854, + "num_tokens": 559850686.0, + "step": 15360 + }, + { + "epoch": 2.85255338904364, + "grad_norm": 1.6328628063201904, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8932361006736755, + "num_tokens": 559884671.0, + "step": 15361 + }, + { + "epoch": 2.852739090064995, + "grad_norm": 1.5502290725708008, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8823277950286865, + "num_tokens": 559922845.0, + "step": 15362 + }, + { + "epoch": 2.852924791086351, + "grad_norm": 1.5744216442108154, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8814160227775574, + "num_tokens": 559961518.0, + "step": 15363 + }, + { + "epoch": 2.8531104921077066, + "grad_norm": 1.5975236892700195, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8883920907974243, + "num_tokens": 559999392.0, + "step": 15364 + }, + { + "epoch": 2.8532961931290624, + "grad_norm": 1.6432842016220093, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8852739334106445, + "num_tokens": 560033260.0, + "step": 15365 + }, + { + "epoch": 2.853481894150418, + "grad_norm": 1.5249921083450317, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.8969860076904297, + "num_tokens": 560068400.0, + "step": 15366 + }, + { + "epoch": 2.8536675951717734, + "grad_norm": 1.8019065856933594, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8780378103256226, + "num_tokens": 560100056.0, + "step": 15367 + }, + { + "epoch": 2.853853296193129, + "grad_norm": 1.5625159740447998, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8766278028488159, + "num_tokens": 560140842.0, + "step": 15368 + }, + { + "epoch": 2.854038997214485, + "grad_norm": 1.568026065826416, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8843444585800171, + "num_tokens": 560185439.0, + "step": 15369 + }, + { + "epoch": 2.85422469823584, + "grad_norm": 1.6973450183868408, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8826804161071777, + "num_tokens": 560223684.0, + "step": 15370 + }, + { + "epoch": 2.854410399257196, + "grad_norm": 1.5960779190063477, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8881056308746338, + "num_tokens": 560261870.0, + "step": 15371 + }, + { + "epoch": 2.8545961002785516, + "grad_norm": 1.5899930000305176, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8957540988922119, + "num_tokens": 560299033.0, + "step": 15372 + }, + { + "epoch": 2.8547818012999073, + "grad_norm": 1.6266790628433228, + "learning_rate": 1e-06, + "loss": 0.2749, + "mean_token_accuracy": 0.8988282680511475, + "num_tokens": 560331914.0, + "step": 15373 + }, + { + "epoch": 2.8549675023212626, + "grad_norm": 1.5787272453308105, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.887795090675354, + "num_tokens": 560371988.0, + "step": 15374 + }, + { + "epoch": 2.8551532033426184, + "grad_norm": 1.759249210357666, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8907023668289185, + "num_tokens": 560406981.0, + "step": 15375 + }, + { + "epoch": 2.855338904363974, + "grad_norm": 1.6755214929580688, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8813931941986084, + "num_tokens": 560444061.0, + "step": 15376 + }, + { + "epoch": 2.8555246053853294, + "grad_norm": 1.637368083000183, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8964905738830566, + "num_tokens": 560477561.0, + "step": 15377 + }, + { + "epoch": 2.855710306406685, + "grad_norm": 1.6567027568817139, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8935711979866028, + "num_tokens": 560511684.0, + "step": 15378 + }, + { + "epoch": 2.855896007428041, + "grad_norm": 1.7509819269180298, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8935738801956177, + "num_tokens": 560547239.0, + "step": 15379 + }, + { + "epoch": 2.8560817084493966, + "grad_norm": 1.6778737306594849, + "learning_rate": 1e-06, + "loss": 0.277, + "mean_token_accuracy": 0.8972978591918945, + "num_tokens": 560576841.0, + "step": 15380 + }, + { + "epoch": 2.8562674094707523, + "grad_norm": 1.5208131074905396, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8932112455368042, + "num_tokens": 560615792.0, + "step": 15381 + }, + { + "epoch": 2.8564531104921076, + "grad_norm": 1.6133307218551636, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8936092853546143, + "num_tokens": 560653984.0, + "step": 15382 + }, + { + "epoch": 2.8566388115134633, + "grad_norm": 1.735988736152649, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8761835098266602, + "num_tokens": 560691239.0, + "step": 15383 + }, + { + "epoch": 2.856824512534819, + "grad_norm": 1.744621753692627, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8920910954475403, + "num_tokens": 560723377.0, + "step": 15384 + }, + { + "epoch": 2.8570102135561743, + "grad_norm": 1.6159635782241821, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8904553055763245, + "num_tokens": 560764104.0, + "step": 15385 + }, + { + "epoch": 2.85719591457753, + "grad_norm": 1.615877628326416, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8760902285575867, + "num_tokens": 560803051.0, + "step": 15386 + }, + { + "epoch": 2.857381615598886, + "grad_norm": 1.3518261909484863, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8867291212081909, + "num_tokens": 560849158.0, + "step": 15387 + }, + { + "epoch": 2.8575673166202415, + "grad_norm": 1.696196436882019, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8819153308868408, + "num_tokens": 560886197.0, + "step": 15388 + }, + { + "epoch": 2.8577530176415973, + "grad_norm": 1.7485450506210327, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8919119238853455, + "num_tokens": 560921165.0, + "step": 15389 + }, + { + "epoch": 2.8579387186629526, + "grad_norm": 1.6156266927719116, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8899503946304321, + "num_tokens": 560958181.0, + "step": 15390 + }, + { + "epoch": 2.8581244196843083, + "grad_norm": 1.528241753578186, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8979032039642334, + "num_tokens": 560996466.0, + "step": 15391 + }, + { + "epoch": 2.858310120705664, + "grad_norm": 1.5292507410049438, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8931358456611633, + "num_tokens": 561030303.0, + "step": 15392 + }, + { + "epoch": 2.8584958217270193, + "grad_norm": 1.4880543947219849, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8774175047874451, + "num_tokens": 561071907.0, + "step": 15393 + }, + { + "epoch": 2.858681522748375, + "grad_norm": 1.6740484237670898, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8860338926315308, + "num_tokens": 561107456.0, + "step": 15394 + }, + { + "epoch": 2.858867223769731, + "grad_norm": 1.6202419996261597, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8946565389633179, + "num_tokens": 561142178.0, + "step": 15395 + }, + { + "epoch": 2.8590529247910865, + "grad_norm": 1.525847315788269, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8803671002388, + "num_tokens": 561181253.0, + "step": 15396 + }, + { + "epoch": 2.859238625812442, + "grad_norm": 1.4023921489715576, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8960056900978088, + "num_tokens": 561224809.0, + "step": 15397 + }, + { + "epoch": 2.8594243268337975, + "grad_norm": 1.5450812578201294, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.884734034538269, + "num_tokens": 561266508.0, + "step": 15398 + }, + { + "epoch": 2.8596100278551533, + "grad_norm": 1.8454318046569824, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.876875638961792, + "num_tokens": 561297375.0, + "step": 15399 + }, + { + "epoch": 2.8597957288765086, + "grad_norm": 1.9922393560409546, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8884704113006592, + "num_tokens": 561330739.0, + "step": 15400 + }, + { + "epoch": 2.8599814298978643, + "grad_norm": 1.8306964635849, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8843814134597778, + "num_tokens": 561361019.0, + "step": 15401 + }, + { + "epoch": 2.86016713091922, + "grad_norm": 1.4640744924545288, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8837121725082397, + "num_tokens": 561407532.0, + "step": 15402 + }, + { + "epoch": 2.8603528319405758, + "grad_norm": 1.5978089570999146, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8892359733581543, + "num_tokens": 561442931.0, + "step": 15403 + }, + { + "epoch": 2.8605385329619315, + "grad_norm": 1.6855584383010864, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8655972480773926, + "num_tokens": 561481008.0, + "step": 15404 + }, + { + "epoch": 2.860724233983287, + "grad_norm": 1.5101900100708008, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8837864995002747, + "num_tokens": 561522579.0, + "step": 15405 + }, + { + "epoch": 2.8609099350046425, + "grad_norm": 1.5800449848175049, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8902069330215454, + "num_tokens": 561561487.0, + "step": 15406 + }, + { + "epoch": 2.8610956360259983, + "grad_norm": 1.897953987121582, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8569763898849487, + "num_tokens": 561594499.0, + "step": 15407 + }, + { + "epoch": 2.8612813370473535, + "grad_norm": 1.7533725500106812, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8840339183807373, + "num_tokens": 561626402.0, + "step": 15408 + }, + { + "epoch": 2.8614670380687093, + "grad_norm": 1.6588138341903687, + "learning_rate": 1e-06, + "loss": 0.2777, + "mean_token_accuracy": 0.8981099724769592, + "num_tokens": 561657561.0, + "step": 15409 + }, + { + "epoch": 2.861652739090065, + "grad_norm": 1.6481823921203613, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8802441358566284, + "num_tokens": 561694321.0, + "step": 15410 + }, + { + "epoch": 2.8618384401114207, + "grad_norm": 1.650731086730957, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8924699425697327, + "num_tokens": 561728250.0, + "step": 15411 + }, + { + "epoch": 2.8620241411327765, + "grad_norm": 1.5497031211853027, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8853919506072998, + "num_tokens": 561765542.0, + "step": 15412 + }, + { + "epoch": 2.8622098421541318, + "grad_norm": 1.635229229927063, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8885362148284912, + "num_tokens": 561800887.0, + "step": 15413 + }, + { + "epoch": 2.8623955431754875, + "grad_norm": 1.7199615240097046, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8866130113601685, + "num_tokens": 561833740.0, + "step": 15414 + }, + { + "epoch": 2.8625812441968432, + "grad_norm": 1.5488330125808716, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8878527283668518, + "num_tokens": 561870551.0, + "step": 15415 + }, + { + "epoch": 2.8627669452181985, + "grad_norm": 1.4319833517074585, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.8894537091255188, + "num_tokens": 561911887.0, + "step": 15416 + }, + { + "epoch": 2.8629526462395543, + "grad_norm": 1.573388934135437, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8941764831542969, + "num_tokens": 561948418.0, + "step": 15417 + }, + { + "epoch": 2.86313834726091, + "grad_norm": 1.488692283630371, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8928428888320923, + "num_tokens": 561989763.0, + "step": 15418 + }, + { + "epoch": 2.8633240482822657, + "grad_norm": 1.5540908575057983, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8930180668830872, + "num_tokens": 562023206.0, + "step": 15419 + }, + { + "epoch": 2.863509749303621, + "grad_norm": 1.8879307508468628, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8731741309165955, + "num_tokens": 562055976.0, + "step": 15420 + }, + { + "epoch": 2.8636954503249767, + "grad_norm": 1.692943811416626, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8720670342445374, + "num_tokens": 562091289.0, + "step": 15421 + }, + { + "epoch": 2.8638811513463325, + "grad_norm": 1.7035623788833618, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8962123394012451, + "num_tokens": 562122534.0, + "step": 15422 + }, + { + "epoch": 2.8640668523676878, + "grad_norm": 1.5700538158416748, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8902319669723511, + "num_tokens": 562161229.0, + "step": 15423 + }, + { + "epoch": 2.8642525533890435, + "grad_norm": 1.7158712148666382, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8811944127082825, + "num_tokens": 562192992.0, + "step": 15424 + }, + { + "epoch": 2.8644382544103992, + "grad_norm": 1.5279914140701294, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8887357115745544, + "num_tokens": 562236245.0, + "step": 15425 + }, + { + "epoch": 2.864623955431755, + "grad_norm": 1.602410912513733, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8933929204940796, + "num_tokens": 562269666.0, + "step": 15426 + }, + { + "epoch": 2.8648096564531107, + "grad_norm": 1.9393495321273804, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8809844255447388, + "num_tokens": 562296716.0, + "step": 15427 + }, + { + "epoch": 2.864995357474466, + "grad_norm": 1.4926587343215942, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8822993636131287, + "num_tokens": 562337566.0, + "step": 15428 + }, + { + "epoch": 2.8651810584958217, + "grad_norm": 1.5566970109939575, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8924288749694824, + "num_tokens": 562377206.0, + "step": 15429 + }, + { + "epoch": 2.8653667595171775, + "grad_norm": 1.6922935247421265, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8759084343910217, + "num_tokens": 562412815.0, + "step": 15430 + }, + { + "epoch": 2.8655524605385327, + "grad_norm": 1.7338942289352417, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8775482177734375, + "num_tokens": 562444506.0, + "step": 15431 + }, + { + "epoch": 2.8657381615598885, + "grad_norm": 1.514288067817688, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8836998343467712, + "num_tokens": 562481888.0, + "step": 15432 + }, + { + "epoch": 2.865923862581244, + "grad_norm": 1.535129189491272, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8931276202201843, + "num_tokens": 562518697.0, + "step": 15433 + }, + { + "epoch": 2.8661095636026, + "grad_norm": 1.6916857957839966, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.890893280506134, + "num_tokens": 562550885.0, + "step": 15434 + }, + { + "epoch": 2.8662952646239557, + "grad_norm": 1.8103117942810059, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8748239278793335, + "num_tokens": 562583270.0, + "step": 15435 + }, + { + "epoch": 2.866480965645311, + "grad_norm": 1.748584508895874, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8819653987884521, + "num_tokens": 562616700.0, + "step": 15436 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 1.5830810070037842, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8834350109100342, + "num_tokens": 562656930.0, + "step": 15437 + }, + { + "epoch": 2.8668523676880224, + "grad_norm": 1.4782127141952515, + "learning_rate": 1e-06, + "loss": 0.264, + "mean_token_accuracy": 0.9022874236106873, + "num_tokens": 562693219.0, + "step": 15438 + }, + { + "epoch": 2.8670380687093777, + "grad_norm": 1.5654575824737549, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8827060461044312, + "num_tokens": 562732195.0, + "step": 15439 + }, + { + "epoch": 2.8672237697307335, + "grad_norm": 1.5313938856124878, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8783756494522095, + "num_tokens": 562775618.0, + "step": 15440 + }, + { + "epoch": 2.867409470752089, + "grad_norm": 1.6626205444335938, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8796718120574951, + "num_tokens": 562810282.0, + "step": 15441 + }, + { + "epoch": 2.867595171773445, + "grad_norm": 1.7964216470718384, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8682056069374084, + "num_tokens": 562843941.0, + "step": 15442 + }, + { + "epoch": 2.8677808727948, + "grad_norm": 1.553436040878296, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8941386342048645, + "num_tokens": 562879786.0, + "step": 15443 + }, + { + "epoch": 2.867966573816156, + "grad_norm": 1.482089877128601, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8932759165763855, + "num_tokens": 562919138.0, + "step": 15444 + }, + { + "epoch": 2.8681522748375117, + "grad_norm": 1.6045273542404175, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8897665143013, + "num_tokens": 562955608.0, + "step": 15445 + }, + { + "epoch": 2.868337975858867, + "grad_norm": 1.7446937561035156, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8926000595092773, + "num_tokens": 562987801.0, + "step": 15446 + }, + { + "epoch": 2.8685236768802227, + "grad_norm": 1.637514352798462, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8947001099586487, + "num_tokens": 563025936.0, + "step": 15447 + }, + { + "epoch": 2.8687093779015784, + "grad_norm": 1.6862658262252808, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8909543752670288, + "num_tokens": 563058644.0, + "step": 15448 + }, + { + "epoch": 2.868895078922934, + "grad_norm": 1.6064136028289795, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8855887055397034, + "num_tokens": 563098181.0, + "step": 15449 + }, + { + "epoch": 2.86908077994429, + "grad_norm": 1.569589376449585, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.8939917683601379, + "num_tokens": 563133914.0, + "step": 15450 + }, + { + "epoch": 2.869266480965645, + "grad_norm": 1.5316442251205444, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8954199552536011, + "num_tokens": 563171095.0, + "step": 15451 + }, + { + "epoch": 2.869452181987001, + "grad_norm": 1.5778950452804565, + "learning_rate": 1e-06, + "loss": 0.2774, + "mean_token_accuracy": 0.8969466686248779, + "num_tokens": 563205327.0, + "step": 15452 + }, + { + "epoch": 2.8696378830083566, + "grad_norm": 1.620479702949524, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8836690187454224, + "num_tokens": 563244285.0, + "step": 15453 + }, + { + "epoch": 2.869823584029712, + "grad_norm": 1.6623473167419434, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8823535442352295, + "num_tokens": 563281365.0, + "step": 15454 + }, + { + "epoch": 2.8700092850510677, + "grad_norm": 1.6473630666732788, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8806067109107971, + "num_tokens": 563318521.0, + "step": 15455 + }, + { + "epoch": 2.8701949860724234, + "grad_norm": 1.5141081809997559, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8834002017974854, + "num_tokens": 563359680.0, + "step": 15456 + }, + { + "epoch": 2.870380687093779, + "grad_norm": 1.7708739042282104, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8814203143119812, + "num_tokens": 563390737.0, + "step": 15457 + }, + { + "epoch": 2.870566388115135, + "grad_norm": 1.5810534954071045, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8817987442016602, + "num_tokens": 563430316.0, + "step": 15458 + }, + { + "epoch": 2.87075208913649, + "grad_norm": 1.6615968942642212, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8852676153182983, + "num_tokens": 563464625.0, + "step": 15459 + }, + { + "epoch": 2.870937790157846, + "grad_norm": 1.7473758459091187, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8809136152267456, + "num_tokens": 563500204.0, + "step": 15460 + }, + { + "epoch": 2.8711234911792016, + "grad_norm": 1.7550990581512451, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8890504837036133, + "num_tokens": 563535024.0, + "step": 15461 + }, + { + "epoch": 2.871309192200557, + "grad_norm": 1.5053868293762207, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8918040990829468, + "num_tokens": 563574454.0, + "step": 15462 + }, + { + "epoch": 2.8714948932219126, + "grad_norm": 1.6216810941696167, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8700354099273682, + "num_tokens": 563613882.0, + "step": 15463 + }, + { + "epoch": 2.8716805942432684, + "grad_norm": 1.7439253330230713, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8790851831436157, + "num_tokens": 563646867.0, + "step": 15464 + }, + { + "epoch": 2.871866295264624, + "grad_norm": 1.5448949337005615, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8926064372062683, + "num_tokens": 563684765.0, + "step": 15465 + }, + { + "epoch": 2.87205199628598, + "grad_norm": 1.5627933740615845, + "learning_rate": 1e-06, + "loss": 0.2428, + "mean_token_accuracy": 0.9110937714576721, + "num_tokens": 563716903.0, + "step": 15466 + }, + { + "epoch": 2.872237697307335, + "grad_norm": 1.5268069505691528, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.89265376329422, + "num_tokens": 563756445.0, + "step": 15467 + }, + { + "epoch": 2.872423398328691, + "grad_norm": 1.5523157119750977, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8870829939842224, + "num_tokens": 563796500.0, + "step": 15468 + }, + { + "epoch": 2.872609099350046, + "grad_norm": 1.394455909729004, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8907685875892639, + "num_tokens": 563841749.0, + "step": 15469 + }, + { + "epoch": 2.872794800371402, + "grad_norm": 1.6637624502182007, + "learning_rate": 1e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.8971869945526123, + "num_tokens": 563873115.0, + "step": 15470 + }, + { + "epoch": 2.8729805013927576, + "grad_norm": 1.5874940156936646, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8921079635620117, + "num_tokens": 563906754.0, + "step": 15471 + }, + { + "epoch": 2.8731662024141134, + "grad_norm": 1.6009968519210815, + "learning_rate": 1e-06, + "loss": 0.2688, + "mean_token_accuracy": 0.9026427865028381, + "num_tokens": 563939685.0, + "step": 15472 + }, + { + "epoch": 2.873351903435469, + "grad_norm": 1.473638653755188, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8923426866531372, + "num_tokens": 563978711.0, + "step": 15473 + }, + { + "epoch": 2.8735376044568244, + "grad_norm": 1.6206146478652954, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8907809257507324, + "num_tokens": 564013106.0, + "step": 15474 + }, + { + "epoch": 2.87372330547818, + "grad_norm": 1.51633620262146, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8895223736763, + "num_tokens": 564054344.0, + "step": 15475 + }, + { + "epoch": 2.873909006499536, + "grad_norm": 1.819167971611023, + "learning_rate": 1e-06, + "loss": 0.2815, + "mean_token_accuracy": 0.8959345817565918, + "num_tokens": 564084756.0, + "step": 15476 + }, + { + "epoch": 2.874094707520891, + "grad_norm": 1.890708565711975, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8792407512664795, + "num_tokens": 564117661.0, + "step": 15477 + }, + { + "epoch": 2.874280408542247, + "grad_norm": 1.6077357530593872, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8794417381286621, + "num_tokens": 564156990.0, + "step": 15478 + }, + { + "epoch": 2.8744661095636026, + "grad_norm": 1.6445772647857666, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8711615204811096, + "num_tokens": 564195533.0, + "step": 15479 + }, + { + "epoch": 2.8746518105849583, + "grad_norm": 1.5769908428192139, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8779441118240356, + "num_tokens": 564237314.0, + "step": 15480 + }, + { + "epoch": 2.874837511606314, + "grad_norm": 1.4647412300109863, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8901488780975342, + "num_tokens": 564278256.0, + "step": 15481 + }, + { + "epoch": 2.8750232126276694, + "grad_norm": 1.5997968912124634, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8909180164337158, + "num_tokens": 564314174.0, + "step": 15482 + }, + { + "epoch": 2.875208913649025, + "grad_norm": 1.5225229263305664, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8889866471290588, + "num_tokens": 564352900.0, + "step": 15483 + }, + { + "epoch": 2.875394614670381, + "grad_norm": 1.5899860858917236, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8791307806968689, + "num_tokens": 564391488.0, + "step": 15484 + }, + { + "epoch": 2.875580315691736, + "grad_norm": 1.4121822118759155, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8890782594680786, + "num_tokens": 564435930.0, + "step": 15485 + }, + { + "epoch": 2.875766016713092, + "grad_norm": 1.6786630153656006, + "learning_rate": 1e-06, + "loss": 0.2741, + "mean_token_accuracy": 0.9014447927474976, + "num_tokens": 564470590.0, + "step": 15486 + }, + { + "epoch": 2.8759517177344476, + "grad_norm": 1.556054949760437, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8916910886764526, + "num_tokens": 564508843.0, + "step": 15487 + }, + { + "epoch": 2.8761374187558033, + "grad_norm": 1.5278021097183228, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8946750164031982, + "num_tokens": 564546197.0, + "step": 15488 + }, + { + "epoch": 2.876323119777159, + "grad_norm": 1.6011581420898438, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8933374881744385, + "num_tokens": 564586508.0, + "step": 15489 + }, + { + "epoch": 2.8765088207985143, + "grad_norm": 1.6969163417816162, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8868772387504578, + "num_tokens": 564618465.0, + "step": 15490 + }, + { + "epoch": 2.87669452181987, + "grad_norm": 1.5744081735610962, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8900662660598755, + "num_tokens": 564654343.0, + "step": 15491 + }, + { + "epoch": 2.8768802228412254, + "grad_norm": 1.7103261947631836, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8940637111663818, + "num_tokens": 564687509.0, + "step": 15492 + }, + { + "epoch": 2.877065923862581, + "grad_norm": 1.6280205249786377, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8880202770233154, + "num_tokens": 564724050.0, + "step": 15493 + }, + { + "epoch": 2.877251624883937, + "grad_norm": 1.5123720169067383, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8819005489349365, + "num_tokens": 564764846.0, + "step": 15494 + }, + { + "epoch": 2.8774373259052926, + "grad_norm": 1.4956504106521606, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8940459489822388, + "num_tokens": 564802878.0, + "step": 15495 + }, + { + "epoch": 2.8776230269266483, + "grad_norm": 1.5387169122695923, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8842204809188843, + "num_tokens": 564845025.0, + "step": 15496 + }, + { + "epoch": 2.8778087279480036, + "grad_norm": 1.6855385303497314, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8784131407737732, + "num_tokens": 564882597.0, + "step": 15497 + }, + { + "epoch": 2.8779944289693593, + "grad_norm": 1.8615531921386719, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8823223114013672, + "num_tokens": 564912861.0, + "step": 15498 + }, + { + "epoch": 2.878180129990715, + "grad_norm": 1.5987153053283691, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8901776075363159, + "num_tokens": 564949179.0, + "step": 15499 + }, + { + "epoch": 2.8783658310120703, + "grad_norm": 1.729931354522705, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8722811937332153, + "num_tokens": 564984086.0, + "step": 15500 + }, + { + "epoch": 2.878551532033426, + "grad_norm": 1.5644795894622803, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8906733989715576, + "num_tokens": 565021868.0, + "step": 15501 + }, + { + "epoch": 2.878737233054782, + "grad_norm": 1.6027274131774902, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8864402174949646, + "num_tokens": 565056919.0, + "step": 15502 + }, + { + "epoch": 2.8789229340761375, + "grad_norm": 1.5748637914657593, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8775283098220825, + "num_tokens": 565095563.0, + "step": 15503 + }, + { + "epoch": 2.8791086350974933, + "grad_norm": 1.5446146726608276, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8835988640785217, + "num_tokens": 565135346.0, + "step": 15504 + }, + { + "epoch": 2.8792943361188486, + "grad_norm": 1.6580302715301514, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8831275105476379, + "num_tokens": 565169226.0, + "step": 15505 + }, + { + "epoch": 2.8794800371402043, + "grad_norm": 1.6535413265228271, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8851625323295593, + "num_tokens": 565203750.0, + "step": 15506 + }, + { + "epoch": 2.87966573816156, + "grad_norm": 1.555503487586975, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.886569082736969, + "num_tokens": 565244156.0, + "step": 15507 + }, + { + "epoch": 2.8798514391829153, + "grad_norm": 1.553261160850525, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8832119703292847, + "num_tokens": 565283579.0, + "step": 15508 + }, + { + "epoch": 2.880037140204271, + "grad_norm": 1.742356300354004, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8926551342010498, + "num_tokens": 565315480.0, + "step": 15509 + }, + { + "epoch": 2.8802228412256268, + "grad_norm": 1.708540916442871, + "learning_rate": 1e-06, + "loss": 0.295, + "mean_token_accuracy": 0.892572283744812, + "num_tokens": 565351909.0, + "step": 15510 + }, + { + "epoch": 2.8804085422469825, + "grad_norm": 1.5440294742584229, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8799634575843811, + "num_tokens": 565389271.0, + "step": 15511 + }, + { + "epoch": 2.8805942432683382, + "grad_norm": 1.659114122390747, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.873656153678894, + "num_tokens": 565424391.0, + "step": 15512 + }, + { + "epoch": 2.8807799442896935, + "grad_norm": 1.50917649269104, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8889538645744324, + "num_tokens": 565464097.0, + "step": 15513 + }, + { + "epoch": 2.8809656453110493, + "grad_norm": 1.6160832643508911, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8862105011940002, + "num_tokens": 565500529.0, + "step": 15514 + }, + { + "epoch": 2.881151346332405, + "grad_norm": 1.554492473602295, + "learning_rate": 1e-06, + "loss": 0.27, + "mean_token_accuracy": 0.8999620079994202, + "num_tokens": 565537851.0, + "step": 15515 + }, + { + "epoch": 2.8813370473537603, + "grad_norm": 1.679978847503662, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8753421902656555, + "num_tokens": 565571337.0, + "step": 15516 + }, + { + "epoch": 2.881522748375116, + "grad_norm": 1.6644601821899414, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.884118914604187, + "num_tokens": 565606843.0, + "step": 15517 + }, + { + "epoch": 2.8817084493964717, + "grad_norm": 1.6382200717926025, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8863599300384521, + "num_tokens": 565646809.0, + "step": 15518 + }, + { + "epoch": 2.8818941504178275, + "grad_norm": 1.6612956523895264, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8830754160881042, + "num_tokens": 565681317.0, + "step": 15519 + }, + { + "epoch": 2.8820798514391828, + "grad_norm": 1.601080060005188, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8900783061981201, + "num_tokens": 565718541.0, + "step": 15520 + }, + { + "epoch": 2.8822655524605385, + "grad_norm": 1.6670846939086914, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8858424425125122, + "num_tokens": 565752406.0, + "step": 15521 + }, + { + "epoch": 2.8824512534818942, + "grad_norm": 1.5986348390579224, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.883470892906189, + "num_tokens": 565789291.0, + "step": 15522 + }, + { + "epoch": 2.8826369545032495, + "grad_norm": 1.7396152019500732, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8852119445800781, + "num_tokens": 565822096.0, + "step": 15523 + }, + { + "epoch": 2.8828226555246053, + "grad_norm": 1.6530635356903076, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8719296455383301, + "num_tokens": 565861264.0, + "step": 15524 + }, + { + "epoch": 2.883008356545961, + "grad_norm": 1.5749808549880981, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8764894604682922, + "num_tokens": 565901441.0, + "step": 15525 + }, + { + "epoch": 2.8831940575673167, + "grad_norm": 1.637808918952942, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8846190571784973, + "num_tokens": 565939457.0, + "step": 15526 + }, + { + "epoch": 2.8833797585886725, + "grad_norm": 1.6067167520523071, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.895343542098999, + "num_tokens": 565974476.0, + "step": 15527 + }, + { + "epoch": 2.8835654596100277, + "grad_norm": 1.5892175436019897, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8921523094177246, + "num_tokens": 566008181.0, + "step": 15528 + }, + { + "epoch": 2.8837511606313835, + "grad_norm": 1.5150525569915771, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8894879221916199, + "num_tokens": 566047780.0, + "step": 15529 + }, + { + "epoch": 2.883936861652739, + "grad_norm": 1.6320559978485107, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8709140419960022, + "num_tokens": 566088184.0, + "step": 15530 + }, + { + "epoch": 2.8841225626740945, + "grad_norm": 1.8313943147659302, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8543024659156799, + "num_tokens": 566123947.0, + "step": 15531 + }, + { + "epoch": 2.8843082636954502, + "grad_norm": 1.63690984249115, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.8933647871017456, + "num_tokens": 566159292.0, + "step": 15532 + }, + { + "epoch": 2.884493964716806, + "grad_norm": 1.6464564800262451, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8845297694206238, + "num_tokens": 566194156.0, + "step": 15533 + }, + { + "epoch": 2.8846796657381617, + "grad_norm": 1.5457878112792969, + "learning_rate": 1e-06, + "loss": 0.2621, + "mean_token_accuracy": 0.9021027088165283, + "num_tokens": 566227334.0, + "step": 15534 + }, + { + "epoch": 2.8848653667595174, + "grad_norm": 1.6309611797332764, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8773976564407349, + "num_tokens": 566261257.0, + "step": 15535 + }, + { + "epoch": 2.8850510677808727, + "grad_norm": 1.526333212852478, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.871575117111206, + "num_tokens": 566303304.0, + "step": 15536 + }, + { + "epoch": 2.8852367688022285, + "grad_norm": 1.574464201927185, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8878949880599976, + "num_tokens": 566342921.0, + "step": 15537 + }, + { + "epoch": 2.885422469823584, + "grad_norm": 1.655718445777893, + "learning_rate": 1e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.9014365673065186, + "num_tokens": 566377411.0, + "step": 15538 + }, + { + "epoch": 2.8856081708449395, + "grad_norm": 1.4699276685714722, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8853874206542969, + "num_tokens": 566423447.0, + "step": 15539 + }, + { + "epoch": 2.885793871866295, + "grad_norm": 1.6528511047363281, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8805332183837891, + "num_tokens": 566457523.0, + "step": 15540 + }, + { + "epoch": 2.885979572887651, + "grad_norm": 1.5370712280273438, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8828837275505066, + "num_tokens": 566494104.0, + "step": 15541 + }, + { + "epoch": 2.8861652739090067, + "grad_norm": 1.5761045217514038, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8835583329200745, + "num_tokens": 566532049.0, + "step": 15542 + }, + { + "epoch": 2.886350974930362, + "grad_norm": 1.5444369316101074, + "learning_rate": 1e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.8995519280433655, + "num_tokens": 566567483.0, + "step": 15543 + }, + { + "epoch": 2.8865366759517177, + "grad_norm": 1.69037663936615, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8701069951057434, + "num_tokens": 566602997.0, + "step": 15544 + }, + { + "epoch": 2.8867223769730734, + "grad_norm": 1.552147626876831, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.8997707962989807, + "num_tokens": 566639802.0, + "step": 15545 + }, + { + "epoch": 2.8869080779944287, + "grad_norm": 1.5765613317489624, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.8926784992218018, + "num_tokens": 566673545.0, + "step": 15546 + }, + { + "epoch": 2.8870937790157845, + "grad_norm": 1.616363525390625, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8927469849586487, + "num_tokens": 566709107.0, + "step": 15547 + }, + { + "epoch": 2.88727948003714, + "grad_norm": 1.6917498111724854, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8817836046218872, + "num_tokens": 566743135.0, + "step": 15548 + }, + { + "epoch": 2.887465181058496, + "grad_norm": 1.6328824758529663, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8723373413085938, + "num_tokens": 566781450.0, + "step": 15549 + }, + { + "epoch": 2.8876508820798517, + "grad_norm": 1.864862322807312, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8756927251815796, + "num_tokens": 566813586.0, + "step": 15550 + }, + { + "epoch": 2.887836583101207, + "grad_norm": 1.5587226152420044, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8831863403320312, + "num_tokens": 566852048.0, + "step": 15551 + }, + { + "epoch": 2.8880222841225627, + "grad_norm": 1.591193675994873, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8980647325515747, + "num_tokens": 566890302.0, + "step": 15552 + }, + { + "epoch": 2.8882079851439184, + "grad_norm": 1.507492184638977, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8755553364753723, + "num_tokens": 566931055.0, + "step": 15553 + }, + { + "epoch": 2.8883936861652737, + "grad_norm": 1.7384614944458008, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8770333528518677, + "num_tokens": 566967843.0, + "step": 15554 + }, + { + "epoch": 2.8885793871866294, + "grad_norm": 1.598731517791748, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8814102411270142, + "num_tokens": 567005585.0, + "step": 15555 + }, + { + "epoch": 2.888765088207985, + "grad_norm": 1.7414809465408325, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8926077485084534, + "num_tokens": 567036542.0, + "step": 15556 + }, + { + "epoch": 2.888950789229341, + "grad_norm": 1.624617338180542, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.883453369140625, + "num_tokens": 567073687.0, + "step": 15557 + }, + { + "epoch": 2.8891364902506966, + "grad_norm": 1.4736859798431396, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8954728841781616, + "num_tokens": 567113155.0, + "step": 15558 + }, + { + "epoch": 2.889322191272052, + "grad_norm": 1.4691523313522339, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8888391256332397, + "num_tokens": 567155844.0, + "step": 15559 + }, + { + "epoch": 2.8895078922934077, + "grad_norm": 1.5725040435791016, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8908539414405823, + "num_tokens": 567191436.0, + "step": 15560 + }, + { + "epoch": 2.8896935933147634, + "grad_norm": 1.6094454526901245, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.884650468826294, + "num_tokens": 567227983.0, + "step": 15561 + }, + { + "epoch": 2.8898792943361187, + "grad_norm": 1.4542502164840698, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8932799100875854, + "num_tokens": 567268158.0, + "step": 15562 + }, + { + "epoch": 2.8900649953574744, + "grad_norm": 1.4180591106414795, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.890221893787384, + "num_tokens": 567309310.0, + "step": 15563 + }, + { + "epoch": 2.89025069637883, + "grad_norm": 1.5510051250457764, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8833898305892944, + "num_tokens": 567346856.0, + "step": 15564 + }, + { + "epoch": 2.890436397400186, + "grad_norm": 1.8275032043457031, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8931410908699036, + "num_tokens": 567375782.0, + "step": 15565 + }, + { + "epoch": 2.890622098421541, + "grad_norm": 1.68739652633667, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8780503273010254, + "num_tokens": 567412013.0, + "step": 15566 + }, + { + "epoch": 2.890807799442897, + "grad_norm": 1.5287672281265259, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.8971052169799805, + "num_tokens": 567450997.0, + "step": 15567 + }, + { + "epoch": 2.8909935004642526, + "grad_norm": 1.3971854448318481, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8849917054176331, + "num_tokens": 567497716.0, + "step": 15568 + }, + { + "epoch": 2.891179201485608, + "grad_norm": 1.5627232789993286, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8843033313751221, + "num_tokens": 567536640.0, + "step": 15569 + }, + { + "epoch": 2.8913649025069637, + "grad_norm": 1.7522203922271729, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8818155527114868, + "num_tokens": 567568140.0, + "step": 15570 + }, + { + "epoch": 2.8915506035283194, + "grad_norm": 1.5973623991012573, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.874947190284729, + "num_tokens": 567606251.0, + "step": 15571 + }, + { + "epoch": 2.891736304549675, + "grad_norm": 1.8393135070800781, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.872456967830658, + "num_tokens": 567637351.0, + "step": 15572 + }, + { + "epoch": 2.891922005571031, + "grad_norm": 1.531003713607788, + "learning_rate": 1e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.896929144859314, + "num_tokens": 567676616.0, + "step": 15573 + }, + { + "epoch": 2.892107706592386, + "grad_norm": 1.7240102291107178, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8826688528060913, + "num_tokens": 567709971.0, + "step": 15574 + }, + { + "epoch": 2.892293407613742, + "grad_norm": 1.5616434812545776, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.890051543712616, + "num_tokens": 567746241.0, + "step": 15575 + }, + { + "epoch": 2.8924791086350976, + "grad_norm": 1.6069676876068115, + "learning_rate": 1e-06, + "loss": 0.2751, + "mean_token_accuracy": 0.9021726250648499, + "num_tokens": 567781989.0, + "step": 15576 + }, + { + "epoch": 2.892664809656453, + "grad_norm": 1.59365975856781, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8890414237976074, + "num_tokens": 567817320.0, + "step": 15577 + }, + { + "epoch": 2.8928505106778086, + "grad_norm": 1.6448386907577515, + "learning_rate": 1e-06, + "loss": 0.2608, + "mean_token_accuracy": 0.9032964706420898, + "num_tokens": 567846202.0, + "step": 15578 + }, + { + "epoch": 2.8930362116991644, + "grad_norm": 1.7194793224334717, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8729230165481567, + "num_tokens": 567880655.0, + "step": 15579 + }, + { + "epoch": 2.89322191272052, + "grad_norm": 1.5858359336853027, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.8999619483947754, + "num_tokens": 567916803.0, + "step": 15580 + }, + { + "epoch": 2.893407613741876, + "grad_norm": 1.5771152973175049, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.897125780582428, + "num_tokens": 567951483.0, + "step": 15581 + }, + { + "epoch": 2.893593314763231, + "grad_norm": 1.4194614887237549, + "learning_rate": 1e-06, + "loss": 0.2753, + "mean_token_accuracy": 0.8990410566329956, + "num_tokens": 567996070.0, + "step": 15582 + }, + { + "epoch": 2.893779015784587, + "grad_norm": 1.5563549995422363, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8717750310897827, + "num_tokens": 568036053.0, + "step": 15583 + }, + { + "epoch": 2.8939647168059426, + "grad_norm": 1.500727653503418, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8843585848808289, + "num_tokens": 568077238.0, + "step": 15584 + }, + { + "epoch": 2.894150417827298, + "grad_norm": 1.587588906288147, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8844034671783447, + "num_tokens": 568117892.0, + "step": 15585 + }, + { + "epoch": 2.8943361188486536, + "grad_norm": 1.5113914012908936, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8904892206192017, + "num_tokens": 568154334.0, + "step": 15586 + }, + { + "epoch": 2.8945218198700093, + "grad_norm": 1.5520639419555664, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8958016633987427, + "num_tokens": 568189754.0, + "step": 15587 + }, + { + "epoch": 2.894707520891365, + "grad_norm": 1.5563534498214722, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8810749650001526, + "num_tokens": 568231389.0, + "step": 15588 + }, + { + "epoch": 2.8948932219127204, + "grad_norm": 1.63844633102417, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8792470693588257, + "num_tokens": 568270703.0, + "step": 15589 + }, + { + "epoch": 2.895078922934076, + "grad_norm": 1.5445348024368286, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8845884203910828, + "num_tokens": 568312434.0, + "step": 15590 + }, + { + "epoch": 2.895264623955432, + "grad_norm": 1.6455464363098145, + "learning_rate": 1e-06, + "loss": 0.2783, + "mean_token_accuracy": 0.8977469801902771, + "num_tokens": 568345203.0, + "step": 15591 + }, + { + "epoch": 2.895450324976787, + "grad_norm": 1.5762338638305664, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8912514448165894, + "num_tokens": 568384354.0, + "step": 15592 + }, + { + "epoch": 2.895636025998143, + "grad_norm": 1.9816510677337646, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8866914510726929, + "num_tokens": 568416052.0, + "step": 15593 + }, + { + "epoch": 2.8958217270194986, + "grad_norm": 1.8405894041061401, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8904865980148315, + "num_tokens": 568447930.0, + "step": 15594 + }, + { + "epoch": 2.8960074280408543, + "grad_norm": 1.6118191480636597, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8758257627487183, + "num_tokens": 568486938.0, + "step": 15595 + }, + { + "epoch": 2.89619312906221, + "grad_norm": 1.5529091358184814, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8796474933624268, + "num_tokens": 568528364.0, + "step": 15596 + }, + { + "epoch": 2.8963788300835653, + "grad_norm": 1.5429435968399048, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.8963463306427002, + "num_tokens": 568564546.0, + "step": 15597 + }, + { + "epoch": 2.896564531104921, + "grad_norm": 1.6658424139022827, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8891149759292603, + "num_tokens": 568600939.0, + "step": 15598 + }, + { + "epoch": 2.896750232126277, + "grad_norm": 1.4976806640625, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8943454623222351, + "num_tokens": 568642674.0, + "step": 15599 + }, + { + "epoch": 2.896935933147632, + "grad_norm": 1.7076884508132935, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.892423152923584, + "num_tokens": 568674224.0, + "step": 15600 + }, + { + "epoch": 2.897121634168988, + "grad_norm": 1.5073283910751343, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8794405460357666, + "num_tokens": 568717886.0, + "step": 15601 + }, + { + "epoch": 2.8973073351903436, + "grad_norm": 1.4123231172561646, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.8969205617904663, + "num_tokens": 568759196.0, + "step": 15602 + }, + { + "epoch": 2.8974930362116993, + "grad_norm": 1.6805921792984009, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8937013149261475, + "num_tokens": 568791911.0, + "step": 15603 + }, + { + "epoch": 2.897678737233055, + "grad_norm": 1.609287977218628, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8932265043258667, + "num_tokens": 568829946.0, + "step": 15604 + }, + { + "epoch": 2.8978644382544103, + "grad_norm": 1.5848753452301025, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8912931680679321, + "num_tokens": 568865975.0, + "step": 15605 + }, + { + "epoch": 2.898050139275766, + "grad_norm": 1.5285365581512451, + "learning_rate": 1e-06, + "loss": 0.2726, + "mean_token_accuracy": 0.9006975889205933, + "num_tokens": 568900400.0, + "step": 15606 + }, + { + "epoch": 2.898235840297122, + "grad_norm": 1.717374563217163, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8804904222488403, + "num_tokens": 568937177.0, + "step": 15607 + }, + { + "epoch": 2.898421541318477, + "grad_norm": 1.5452715158462524, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8912139534950256, + "num_tokens": 568973442.0, + "step": 15608 + }, + { + "epoch": 2.898607242339833, + "grad_norm": 1.4730839729309082, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8923463821411133, + "num_tokens": 569011906.0, + "step": 15609 + }, + { + "epoch": 2.8987929433611885, + "grad_norm": 1.6803573369979858, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8878150582313538, + "num_tokens": 569045174.0, + "step": 15610 + }, + { + "epoch": 2.8989786443825443, + "grad_norm": 1.7985262870788574, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8818932771682739, + "num_tokens": 569075729.0, + "step": 15611 + }, + { + "epoch": 2.8991643454038996, + "grad_norm": 1.5259172916412354, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8936160206794739, + "num_tokens": 569111735.0, + "step": 15612 + }, + { + "epoch": 2.8993500464252553, + "grad_norm": 1.5654343366622925, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8882114887237549, + "num_tokens": 569150168.0, + "step": 15613 + }, + { + "epoch": 2.899535747446611, + "grad_norm": 1.913006067276001, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8866885900497437, + "num_tokens": 569177992.0, + "step": 15614 + }, + { + "epoch": 2.8997214484679663, + "grad_norm": 1.6279727220535278, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8828859329223633, + "num_tokens": 569213611.0, + "step": 15615 + }, + { + "epoch": 2.899907149489322, + "grad_norm": 1.484673261642456, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8917753100395203, + "num_tokens": 569252782.0, + "step": 15616 + }, + { + "epoch": 2.9000928505106778, + "grad_norm": 1.5689599514007568, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8780791163444519, + "num_tokens": 569288565.0, + "step": 15617 + }, + { + "epoch": 2.9002785515320335, + "grad_norm": 1.7097034454345703, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8773571252822876, + "num_tokens": 569323571.0, + "step": 15618 + }, + { + "epoch": 2.9004642525533892, + "grad_norm": 1.700032353401184, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8881422281265259, + "num_tokens": 569357081.0, + "step": 15619 + }, + { + "epoch": 2.9006499535747445, + "grad_norm": 1.6014459133148193, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8939995765686035, + "num_tokens": 569391602.0, + "step": 15620 + }, + { + "epoch": 2.9008356545961003, + "grad_norm": 1.6196414232254028, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8923854827880859, + "num_tokens": 569425174.0, + "step": 15621 + }, + { + "epoch": 2.901021355617456, + "grad_norm": 1.6997497081756592, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8738715052604675, + "num_tokens": 569459671.0, + "step": 15622 + }, + { + "epoch": 2.9012070566388113, + "grad_norm": 1.7212947607040405, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8798575401306152, + "num_tokens": 569492146.0, + "step": 15623 + }, + { + "epoch": 2.901392757660167, + "grad_norm": 1.709731101989746, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8888202905654907, + "num_tokens": 569530846.0, + "step": 15624 + }, + { + "epoch": 2.9015784586815228, + "grad_norm": 1.7145373821258545, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8743739128112793, + "num_tokens": 569565138.0, + "step": 15625 + }, + { + "epoch": 2.9017641597028785, + "grad_norm": 1.5777199268341064, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8844777345657349, + "num_tokens": 569603032.0, + "step": 15626 + }, + { + "epoch": 2.901949860724234, + "grad_norm": 1.6631134748458862, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8838147521018982, + "num_tokens": 569637531.0, + "step": 15627 + }, + { + "epoch": 2.9021355617455895, + "grad_norm": 1.5675610303878784, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8896021246910095, + "num_tokens": 569672747.0, + "step": 15628 + }, + { + "epoch": 2.9023212627669452, + "grad_norm": 1.6429766416549683, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8960679769515991, + "num_tokens": 569704146.0, + "step": 15629 + }, + { + "epoch": 2.902506963788301, + "grad_norm": 1.6507971286773682, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8901699185371399, + "num_tokens": 569738104.0, + "step": 15630 + }, + { + "epoch": 2.9026926648096563, + "grad_norm": 1.6269934177398682, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8846734166145325, + "num_tokens": 569771909.0, + "step": 15631 + }, + { + "epoch": 2.902878365831012, + "grad_norm": 1.6326943635940552, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8938485980033875, + "num_tokens": 569805445.0, + "step": 15632 + }, + { + "epoch": 2.9030640668523677, + "grad_norm": 1.5588659048080444, + "learning_rate": 1e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.8992123007774353, + "num_tokens": 569839285.0, + "step": 15633 + }, + { + "epoch": 2.9032497678737235, + "grad_norm": 1.6443111896514893, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8868606686592102, + "num_tokens": 569873233.0, + "step": 15634 + }, + { + "epoch": 2.903435468895079, + "grad_norm": 1.5928525924682617, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8732170462608337, + "num_tokens": 569913695.0, + "step": 15635 + }, + { + "epoch": 2.9036211699164345, + "grad_norm": 1.5607805252075195, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8781894445419312, + "num_tokens": 569951397.0, + "step": 15636 + }, + { + "epoch": 2.90380687093779, + "grad_norm": 1.486246943473816, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8950607776641846, + "num_tokens": 569992287.0, + "step": 15637 + }, + { + "epoch": 2.9039925719591455, + "grad_norm": 1.786739706993103, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8866478204727173, + "num_tokens": 570026402.0, + "step": 15638 + }, + { + "epoch": 2.9041782729805012, + "grad_norm": 1.4539738893508911, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.900409460067749, + "num_tokens": 570064080.0, + "step": 15639 + }, + { + "epoch": 2.904363974001857, + "grad_norm": 1.6655958890914917, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8814637064933777, + "num_tokens": 570098917.0, + "step": 15640 + }, + { + "epoch": 2.9045496750232127, + "grad_norm": 1.7249929904937744, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8894058465957642, + "num_tokens": 570131787.0, + "step": 15641 + }, + { + "epoch": 2.9047353760445684, + "grad_norm": 1.5393813848495483, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8807879686355591, + "num_tokens": 570174293.0, + "step": 15642 + }, + { + "epoch": 2.9049210770659237, + "grad_norm": 1.6330251693725586, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8849637508392334, + "num_tokens": 570210343.0, + "step": 15643 + }, + { + "epoch": 2.9051067780872795, + "grad_norm": 1.5373555421829224, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8847215175628662, + "num_tokens": 570252054.0, + "step": 15644 + }, + { + "epoch": 2.905292479108635, + "grad_norm": 1.5905146598815918, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8888242244720459, + "num_tokens": 570286865.0, + "step": 15645 + }, + { + "epoch": 2.9054781801299905, + "grad_norm": 1.6322177648544312, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.889494776725769, + "num_tokens": 570324631.0, + "step": 15646 + }, + { + "epoch": 2.905663881151346, + "grad_norm": 1.8523051738739014, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8721530437469482, + "num_tokens": 570354965.0, + "step": 15647 + }, + { + "epoch": 2.905849582172702, + "grad_norm": 1.4442628622055054, + "learning_rate": 1e-06, + "loss": 0.2721, + "mean_token_accuracy": 0.9008356928825378, + "num_tokens": 570392991.0, + "step": 15648 + }, + { + "epoch": 2.9060352831940577, + "grad_norm": 1.6403770446777344, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8834805488586426, + "num_tokens": 570428524.0, + "step": 15649 + }, + { + "epoch": 2.9062209842154134, + "grad_norm": 1.5394843816757202, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8946738243103027, + "num_tokens": 570465076.0, + "step": 15650 + }, + { + "epoch": 2.9064066852367687, + "grad_norm": 1.6917393207550049, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8936163187026978, + "num_tokens": 570497042.0, + "step": 15651 + }, + { + "epoch": 2.9065923862581244, + "grad_norm": 1.5819432735443115, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8879880309104919, + "num_tokens": 570533686.0, + "step": 15652 + }, + { + "epoch": 2.90677808727948, + "grad_norm": 1.7054215669631958, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8912461400032043, + "num_tokens": 570566868.0, + "step": 15653 + }, + { + "epoch": 2.9069637883008355, + "grad_norm": 1.6853424310684204, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8807510137557983, + "num_tokens": 570599005.0, + "step": 15654 + }, + { + "epoch": 2.907149489322191, + "grad_norm": 1.5921581983566284, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8746830224990845, + "num_tokens": 570639482.0, + "step": 15655 + }, + { + "epoch": 2.907335190343547, + "grad_norm": 1.5767658948898315, + "learning_rate": 1e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.900627076625824, + "num_tokens": 570674346.0, + "step": 15656 + }, + { + "epoch": 2.9075208913649027, + "grad_norm": 1.8622459173202515, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8851332664489746, + "num_tokens": 570704159.0, + "step": 15657 + }, + { + "epoch": 2.9077065923862584, + "grad_norm": 1.6155165433883667, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8841042518615723, + "num_tokens": 570739898.0, + "step": 15658 + }, + { + "epoch": 2.9078922934076137, + "grad_norm": 1.694004774093628, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8819723129272461, + "num_tokens": 570773633.0, + "step": 15659 + }, + { + "epoch": 2.9080779944289694, + "grad_norm": 1.5447003841400146, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8933234214782715, + "num_tokens": 570818453.0, + "step": 15660 + }, + { + "epoch": 2.9082636954503247, + "grad_norm": 1.5902676582336426, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.8979083299636841, + "num_tokens": 570855557.0, + "step": 15661 + }, + { + "epoch": 2.9084493964716804, + "grad_norm": 1.5741890668869019, + "learning_rate": 1e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.9013997912406921, + "num_tokens": 570889949.0, + "step": 15662 + }, + { + "epoch": 2.908635097493036, + "grad_norm": 1.4969874620437622, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.883145272731781, + "num_tokens": 570930486.0, + "step": 15663 + }, + { + "epoch": 2.908820798514392, + "grad_norm": 1.616485595703125, + "learning_rate": 1e-06, + "loss": 0.2648, + "mean_token_accuracy": 0.9033674597740173, + "num_tokens": 570964512.0, + "step": 15664 + }, + { + "epoch": 2.9090064995357476, + "grad_norm": 1.377983808517456, + "learning_rate": 1e-06, + "loss": 0.2642, + "mean_token_accuracy": 0.9024869203567505, + "num_tokens": 571005372.0, + "step": 15665 + }, + { + "epoch": 2.909192200557103, + "grad_norm": 1.6232837438583374, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.892052173614502, + "num_tokens": 571039825.0, + "step": 15666 + }, + { + "epoch": 2.9093779015784587, + "grad_norm": 1.658558964729309, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8787180185317993, + "num_tokens": 571080401.0, + "step": 15667 + }, + { + "epoch": 2.9095636025998144, + "grad_norm": 1.6639668941497803, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8948578834533691, + "num_tokens": 571115829.0, + "step": 15668 + }, + { + "epoch": 2.9097493036211697, + "grad_norm": 1.5586497783660889, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8937073945999146, + "num_tokens": 571151909.0, + "step": 15669 + }, + { + "epoch": 2.9099350046425254, + "grad_norm": 1.4300267696380615, + "learning_rate": 1e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.8973554372787476, + "num_tokens": 571192120.0, + "step": 15670 + }, + { + "epoch": 2.910120705663881, + "grad_norm": 1.6768726110458374, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8770490288734436, + "num_tokens": 571229991.0, + "step": 15671 + }, + { + "epoch": 2.910306406685237, + "grad_norm": 1.6053858995437622, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8916224837303162, + "num_tokens": 571267785.0, + "step": 15672 + }, + { + "epoch": 2.9104921077065926, + "grad_norm": 1.773052453994751, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.879788875579834, + "num_tokens": 571303800.0, + "step": 15673 + }, + { + "epoch": 2.910677808727948, + "grad_norm": 1.7507802248001099, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8810043334960938, + "num_tokens": 571339060.0, + "step": 15674 + }, + { + "epoch": 2.9108635097493036, + "grad_norm": 1.6187490224838257, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8717035055160522, + "num_tokens": 571374965.0, + "step": 15675 + }, + { + "epoch": 2.9110492107706594, + "grad_norm": 1.7035366296768188, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8916835188865662, + "num_tokens": 571407293.0, + "step": 15676 + }, + { + "epoch": 2.9112349117920147, + "grad_norm": 1.6757545471191406, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8742282390594482, + "num_tokens": 571444077.0, + "step": 15677 + }, + { + "epoch": 2.9114206128133704, + "grad_norm": 1.5964810848236084, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8783998489379883, + "num_tokens": 571482657.0, + "step": 15678 + }, + { + "epoch": 2.911606313834726, + "grad_norm": 1.6970462799072266, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8857313394546509, + "num_tokens": 571516707.0, + "step": 15679 + }, + { + "epoch": 2.911792014856082, + "grad_norm": 1.5649001598358154, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8905357122421265, + "num_tokens": 571551864.0, + "step": 15680 + }, + { + "epoch": 2.9119777158774376, + "grad_norm": 1.7661620378494263, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8792446851730347, + "num_tokens": 571586259.0, + "step": 15681 + }, + { + "epoch": 2.912163416898793, + "grad_norm": 1.8621217012405396, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8888860940933228, + "num_tokens": 571616819.0, + "step": 15682 + }, + { + "epoch": 2.9123491179201486, + "grad_norm": 1.6123039722442627, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8883506655693054, + "num_tokens": 571652681.0, + "step": 15683 + }, + { + "epoch": 2.9125348189415043, + "grad_norm": 1.660438895225525, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8955681324005127, + "num_tokens": 571686027.0, + "step": 15684 + }, + { + "epoch": 2.9127205199628596, + "grad_norm": 1.5436948537826538, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.892325758934021, + "num_tokens": 571722106.0, + "step": 15685 + }, + { + "epoch": 2.9129062209842154, + "grad_norm": 1.6695301532745361, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8827028274536133, + "num_tokens": 571753482.0, + "step": 15686 + }, + { + "epoch": 2.913091922005571, + "grad_norm": 1.7789206504821777, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8836352229118347, + "num_tokens": 571784390.0, + "step": 15687 + }, + { + "epoch": 2.913277623026927, + "grad_norm": 1.6565476655960083, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8882676959037781, + "num_tokens": 571822079.0, + "step": 15688 + }, + { + "epoch": 2.913463324048282, + "grad_norm": 1.6954110860824585, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8896058797836304, + "num_tokens": 571858541.0, + "step": 15689 + }, + { + "epoch": 2.913649025069638, + "grad_norm": 1.4967374801635742, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8895521759986877, + "num_tokens": 571898738.0, + "step": 15690 + }, + { + "epoch": 2.9138347260909936, + "grad_norm": 1.5243359804153442, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8908538222312927, + "num_tokens": 571934845.0, + "step": 15691 + }, + { + "epoch": 2.914020427112349, + "grad_norm": 1.6362173557281494, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8797918558120728, + "num_tokens": 571968017.0, + "step": 15692 + }, + { + "epoch": 2.9142061281337046, + "grad_norm": 1.5320736169815063, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8899412155151367, + "num_tokens": 572008667.0, + "step": 15693 + }, + { + "epoch": 2.9143918291550603, + "grad_norm": 1.607763648033142, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8854587078094482, + "num_tokens": 572044691.0, + "step": 15694 + }, + { + "epoch": 2.914577530176416, + "grad_norm": 1.7274608612060547, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8672324419021606, + "num_tokens": 572081198.0, + "step": 15695 + }, + { + "epoch": 2.914763231197772, + "grad_norm": 1.5878490209579468, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8969314098358154, + "num_tokens": 572115108.0, + "step": 15696 + }, + { + "epoch": 2.914948932219127, + "grad_norm": 1.5481128692626953, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8906888961791992, + "num_tokens": 572155449.0, + "step": 15697 + }, + { + "epoch": 2.915134633240483, + "grad_norm": 1.6402086019515991, + "learning_rate": 1e-06, + "loss": 0.2638, + "mean_token_accuracy": 0.9032650589942932, + "num_tokens": 572185428.0, + "step": 15698 + }, + { + "epoch": 2.9153203342618386, + "grad_norm": 1.500809669494629, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8818766474723816, + "num_tokens": 572231280.0, + "step": 15699 + }, + { + "epoch": 2.915506035283194, + "grad_norm": 1.7874326705932617, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8851338028907776, + "num_tokens": 572266694.0, + "step": 15700 + }, + { + "epoch": 2.9156917363045496, + "grad_norm": 1.4767264127731323, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.892356812953949, + "num_tokens": 572307245.0, + "step": 15701 + }, + { + "epoch": 2.9158774373259053, + "grad_norm": 1.7247315645217896, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8932303190231323, + "num_tokens": 572340547.0, + "step": 15702 + }, + { + "epoch": 2.916063138347261, + "grad_norm": 1.6213068962097168, + "learning_rate": 1e-06, + "loss": 0.2755, + "mean_token_accuracy": 0.9013890027999878, + "num_tokens": 572373593.0, + "step": 15703 + }, + { + "epoch": 2.916248839368617, + "grad_norm": 1.649426817893982, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8806389570236206, + "num_tokens": 572409216.0, + "step": 15704 + }, + { + "epoch": 2.916434540389972, + "grad_norm": 1.592142939567566, + "learning_rate": 1e-06, + "loss": 0.269, + "mean_token_accuracy": 0.9022620916366577, + "num_tokens": 572445819.0, + "step": 15705 + }, + { + "epoch": 2.916620241411328, + "grad_norm": 1.7013611793518066, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.880483865737915, + "num_tokens": 572485457.0, + "step": 15706 + }, + { + "epoch": 2.9168059424326835, + "grad_norm": 1.5236256122589111, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8896822929382324, + "num_tokens": 572522945.0, + "step": 15707 + }, + { + "epoch": 2.916991643454039, + "grad_norm": 1.7508270740509033, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8745869398117065, + "num_tokens": 572558877.0, + "step": 15708 + }, + { + "epoch": 2.9171773444753946, + "grad_norm": 1.759842872619629, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8851807117462158, + "num_tokens": 572588192.0, + "step": 15709 + }, + { + "epoch": 2.9173630454967503, + "grad_norm": 1.4142587184906006, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8932929635047913, + "num_tokens": 572628367.0, + "step": 15710 + }, + { + "epoch": 2.917548746518106, + "grad_norm": 1.6694492101669312, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8727014660835266, + "num_tokens": 572665893.0, + "step": 15711 + }, + { + "epoch": 2.9177344475394613, + "grad_norm": 1.450137972831726, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.8997381925582886, + "num_tokens": 572706465.0, + "step": 15712 + }, + { + "epoch": 2.917920148560817, + "grad_norm": 1.5966728925704956, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8810861706733704, + "num_tokens": 572746057.0, + "step": 15713 + }, + { + "epoch": 2.918105849582173, + "grad_norm": 1.6998134851455688, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8754254579544067, + "num_tokens": 572783969.0, + "step": 15714 + }, + { + "epoch": 2.918291550603528, + "grad_norm": 1.5358489751815796, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8859683275222778, + "num_tokens": 572826270.0, + "step": 15715 + }, + { + "epoch": 2.918477251624884, + "grad_norm": 1.674946665763855, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8643949031829834, + "num_tokens": 572863821.0, + "step": 15716 + }, + { + "epoch": 2.9186629526462395, + "grad_norm": 1.6159158945083618, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.884846568107605, + "num_tokens": 572899761.0, + "step": 15717 + }, + { + "epoch": 2.9188486536675953, + "grad_norm": 1.472487449645996, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.893145740032196, + "num_tokens": 572939229.0, + "step": 15718 + }, + { + "epoch": 2.919034354688951, + "grad_norm": 1.5994006395339966, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.8982657194137573, + "num_tokens": 572976803.0, + "step": 15719 + }, + { + "epoch": 2.9192200557103063, + "grad_norm": 1.7894558906555176, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8735145330429077, + "num_tokens": 573010761.0, + "step": 15720 + }, + { + "epoch": 2.919405756731662, + "grad_norm": 1.7971805334091187, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.886732816696167, + "num_tokens": 573039267.0, + "step": 15721 + }, + { + "epoch": 2.9195914577530178, + "grad_norm": 1.5496350526809692, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8904991745948792, + "num_tokens": 573076448.0, + "step": 15722 + }, + { + "epoch": 2.919777158774373, + "grad_norm": 1.5368046760559082, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8850071430206299, + "num_tokens": 573114699.0, + "step": 15723 + }, + { + "epoch": 2.919962859795729, + "grad_norm": 1.7097479104995728, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8857529759407043, + "num_tokens": 573148214.0, + "step": 15724 + }, + { + "epoch": 2.9201485608170845, + "grad_norm": 1.4728575944900513, + "learning_rate": 1e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.8960238099098206, + "num_tokens": 573189889.0, + "step": 15725 + }, + { + "epoch": 2.9203342618384402, + "grad_norm": 1.6225651502609253, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8848399519920349, + "num_tokens": 573227649.0, + "step": 15726 + }, + { + "epoch": 2.920519962859796, + "grad_norm": 1.6743721961975098, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8792396187782288, + "num_tokens": 573263003.0, + "step": 15727 + }, + { + "epoch": 2.9207056638811513, + "grad_norm": 1.4668481349945068, + "learning_rate": 1e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.900850236415863, + "num_tokens": 573300467.0, + "step": 15728 + }, + { + "epoch": 2.920891364902507, + "grad_norm": 1.6025956869125366, + "learning_rate": 1e-06, + "loss": 0.278, + "mean_token_accuracy": 0.8992807269096375, + "num_tokens": 573336567.0, + "step": 15729 + }, + { + "epoch": 2.9210770659238627, + "grad_norm": 1.685559630393982, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.886638879776001, + "num_tokens": 573371139.0, + "step": 15730 + }, + { + "epoch": 2.921262766945218, + "grad_norm": 1.5185421705245972, + "learning_rate": 1e-06, + "loss": 0.2716, + "mean_token_accuracy": 0.9028005003929138, + "num_tokens": 573409444.0, + "step": 15731 + }, + { + "epoch": 2.9214484679665738, + "grad_norm": 1.5835707187652588, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8768221139907837, + "num_tokens": 573447661.0, + "step": 15732 + }, + { + "epoch": 2.9216341689879295, + "grad_norm": 1.6628831624984741, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8922011852264404, + "num_tokens": 573484726.0, + "step": 15733 + }, + { + "epoch": 2.9218198700092852, + "grad_norm": 1.5205271244049072, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.882723331451416, + "num_tokens": 573526030.0, + "step": 15734 + }, + { + "epoch": 2.9220055710306405, + "grad_norm": 2.0253188610076904, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8615841865539551, + "num_tokens": 573555166.0, + "step": 15735 + }, + { + "epoch": 2.9221912720519962, + "grad_norm": 1.682707667350769, + "learning_rate": 1e-06, + "loss": 0.275, + "mean_token_accuracy": 0.9002014994621277, + "num_tokens": 573589847.0, + "step": 15736 + }, + { + "epoch": 2.922376973073352, + "grad_norm": 1.7037392854690552, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8872448205947876, + "num_tokens": 573623379.0, + "step": 15737 + }, + { + "epoch": 2.9225626740947073, + "grad_norm": 1.6014496088027954, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8976008892059326, + "num_tokens": 573656041.0, + "step": 15738 + }, + { + "epoch": 2.922748375116063, + "grad_norm": 1.7117228507995605, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8951598405838013, + "num_tokens": 573687012.0, + "step": 15739 + }, + { + "epoch": 2.9229340761374187, + "grad_norm": 1.6007001399993896, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8854131698608398, + "num_tokens": 573724041.0, + "step": 15740 + }, + { + "epoch": 2.9231197771587745, + "grad_norm": 1.5830624103546143, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.884118914604187, + "num_tokens": 573762262.0, + "step": 15741 + }, + { + "epoch": 2.92330547818013, + "grad_norm": 1.6132220029830933, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8904644250869751, + "num_tokens": 573798135.0, + "step": 15742 + }, + { + "epoch": 2.9234911792014855, + "grad_norm": 1.5602505207061768, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8957512378692627, + "num_tokens": 573836173.0, + "step": 15743 + }, + { + "epoch": 2.9236768802228412, + "grad_norm": 1.6208510398864746, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8847167491912842, + "num_tokens": 573873531.0, + "step": 15744 + }, + { + "epoch": 2.923862581244197, + "grad_norm": 1.6354461908340454, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8903179168701172, + "num_tokens": 573909837.0, + "step": 15745 + }, + { + "epoch": 2.9240482822655522, + "grad_norm": 1.6297783851623535, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8833287954330444, + "num_tokens": 573946886.0, + "step": 15746 + }, + { + "epoch": 2.924233983286908, + "grad_norm": 1.5504193305969238, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8860028982162476, + "num_tokens": 573986380.0, + "step": 15747 + }, + { + "epoch": 2.9244196843082637, + "grad_norm": 1.629024624824524, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8897099494934082, + "num_tokens": 574023283.0, + "step": 15748 + }, + { + "epoch": 2.9246053853296194, + "grad_norm": 1.7431085109710693, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8810951709747314, + "num_tokens": 574055551.0, + "step": 15749 + }, + { + "epoch": 2.924791086350975, + "grad_norm": 1.6578434705734253, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8811508417129517, + "num_tokens": 574094097.0, + "step": 15750 + }, + { + "epoch": 2.9249767873723305, + "grad_norm": 1.6561881303787231, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8814632296562195, + "num_tokens": 574128132.0, + "step": 15751 + }, + { + "epoch": 2.925162488393686, + "grad_norm": 1.664526343345642, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8946865200996399, + "num_tokens": 574163215.0, + "step": 15752 + }, + { + "epoch": 2.925348189415042, + "grad_norm": 1.6298174858093262, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8925605416297913, + "num_tokens": 574197228.0, + "step": 15753 + }, + { + "epoch": 2.925533890436397, + "grad_norm": 1.6260501146316528, + "learning_rate": 1e-06, + "loss": 0.2767, + "mean_token_accuracy": 0.8982079029083252, + "num_tokens": 574232500.0, + "step": 15754 + }, + { + "epoch": 2.925719591457753, + "grad_norm": 1.716900110244751, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.886520504951477, + "num_tokens": 574268001.0, + "step": 15755 + }, + { + "epoch": 2.9259052924791087, + "grad_norm": 1.7859572172164917, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8667603135108948, + "num_tokens": 574303173.0, + "step": 15756 + }, + { + "epoch": 2.9260909935004644, + "grad_norm": 1.4770328998565674, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8782786130905151, + "num_tokens": 574347325.0, + "step": 15757 + }, + { + "epoch": 2.9262766945218197, + "grad_norm": 1.6620410680770874, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8717913627624512, + "num_tokens": 574390045.0, + "step": 15758 + }, + { + "epoch": 2.9264623955431754, + "grad_norm": 1.8947237730026245, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8914647102355957, + "num_tokens": 574419402.0, + "step": 15759 + }, + { + "epoch": 2.926648096564531, + "grad_norm": 1.4891246557235718, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8726044297218323, + "num_tokens": 574464124.0, + "step": 15760 + }, + { + "epoch": 2.9268337975858865, + "grad_norm": 1.5395063161849976, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.886490523815155, + "num_tokens": 574500411.0, + "step": 15761 + }, + { + "epoch": 2.927019498607242, + "grad_norm": 1.493199110031128, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8878106474876404, + "num_tokens": 574537889.0, + "step": 15762 + }, + { + "epoch": 2.927205199628598, + "grad_norm": 1.4780339002609253, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8885505199432373, + "num_tokens": 574576922.0, + "step": 15763 + }, + { + "epoch": 2.9273909006499537, + "grad_norm": 1.597688913345337, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8916018009185791, + "num_tokens": 574609831.0, + "step": 15764 + }, + { + "epoch": 2.9275766016713094, + "grad_norm": 1.4922312498092651, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8852390646934509, + "num_tokens": 574648203.0, + "step": 15765 + }, + { + "epoch": 2.9277623026926647, + "grad_norm": 1.381170392036438, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8783680200576782, + "num_tokens": 574696342.0, + "step": 15766 + }, + { + "epoch": 2.9279480037140204, + "grad_norm": 1.8347464799880981, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8794294595718384, + "num_tokens": 574727828.0, + "step": 15767 + }, + { + "epoch": 2.928133704735376, + "grad_norm": 1.5280495882034302, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8957206010818481, + "num_tokens": 574766473.0, + "step": 15768 + }, + { + "epoch": 2.9283194057567314, + "grad_norm": 1.6209741830825806, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8814537525177002, + "num_tokens": 574807322.0, + "step": 15769 + }, + { + "epoch": 2.928505106778087, + "grad_norm": 1.7282638549804688, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8926457166671753, + "num_tokens": 574840399.0, + "step": 15770 + }, + { + "epoch": 2.928690807799443, + "grad_norm": 1.7766108512878418, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8910278677940369, + "num_tokens": 574870752.0, + "step": 15771 + }, + { + "epoch": 2.9288765088207986, + "grad_norm": 1.5794926881790161, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.878084659576416, + "num_tokens": 574911355.0, + "step": 15772 + }, + { + "epoch": 2.9290622098421544, + "grad_norm": 1.7864903211593628, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8798882365226746, + "num_tokens": 574944480.0, + "step": 15773 + }, + { + "epoch": 2.9292479108635097, + "grad_norm": 1.4850233793258667, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8909909129142761, + "num_tokens": 574985275.0, + "step": 15774 + }, + { + "epoch": 2.9294336118848654, + "grad_norm": 1.5951838493347168, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8899321556091309, + "num_tokens": 575020671.0, + "step": 15775 + }, + { + "epoch": 2.929619312906221, + "grad_norm": 1.6143732070922852, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8813301920890808, + "num_tokens": 575058078.0, + "step": 15776 + }, + { + "epoch": 2.9298050139275764, + "grad_norm": 1.8647122383117676, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8681002855300903, + "num_tokens": 575096617.0, + "step": 15777 + }, + { + "epoch": 2.929990714948932, + "grad_norm": 1.6163885593414307, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8780788779258728, + "num_tokens": 575139332.0, + "step": 15778 + }, + { + "epoch": 2.930176415970288, + "grad_norm": 1.5743316411972046, + "learning_rate": 1e-06, + "loss": 0.2727, + "mean_token_accuracy": 0.8985072374343872, + "num_tokens": 575175772.0, + "step": 15779 + }, + { + "epoch": 2.9303621169916436, + "grad_norm": 1.5208818912506104, + "learning_rate": 1e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.8938170075416565, + "num_tokens": 575216059.0, + "step": 15780 + }, + { + "epoch": 2.930547818012999, + "grad_norm": 1.481796145439148, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8884458541870117, + "num_tokens": 575256693.0, + "step": 15781 + }, + { + "epoch": 2.9307335190343546, + "grad_norm": 1.6336638927459717, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8935301303863525, + "num_tokens": 575290439.0, + "step": 15782 + }, + { + "epoch": 2.9309192200557104, + "grad_norm": 1.455747127532959, + "learning_rate": 1e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.8980329036712646, + "num_tokens": 575331276.0, + "step": 15783 + }, + { + "epoch": 2.9311049210770657, + "grad_norm": 1.6512638330459595, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8872174620628357, + "num_tokens": 575365311.0, + "step": 15784 + }, + { + "epoch": 2.9312906220984214, + "grad_norm": 1.7160385847091675, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8771568536758423, + "num_tokens": 575401060.0, + "step": 15785 + }, + { + "epoch": 2.931476323119777, + "grad_norm": 1.4314788579940796, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8902597427368164, + "num_tokens": 575444265.0, + "step": 15786 + }, + { + "epoch": 2.931662024141133, + "grad_norm": 1.5949864387512207, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8944650888442993, + "num_tokens": 575480829.0, + "step": 15787 + }, + { + "epoch": 2.9318477251624886, + "grad_norm": 1.5979211330413818, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.891313910484314, + "num_tokens": 575519228.0, + "step": 15788 + }, + { + "epoch": 2.932033426183844, + "grad_norm": 1.5980840921401978, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8707406520843506, + "num_tokens": 575558209.0, + "step": 15789 + }, + { + "epoch": 2.9322191272051996, + "grad_norm": 1.5187703371047974, + "learning_rate": 1e-06, + "loss": 0.2745, + "mean_token_accuracy": 0.9007604718208313, + "num_tokens": 575594480.0, + "step": 15790 + }, + { + "epoch": 2.9324048282265553, + "grad_norm": 1.626009464263916, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.888993501663208, + "num_tokens": 575625350.0, + "step": 15791 + }, + { + "epoch": 2.9325905292479106, + "grad_norm": 1.628792405128479, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.8962842226028442, + "num_tokens": 575656380.0, + "step": 15792 + }, + { + "epoch": 2.9327762302692664, + "grad_norm": 1.6544275283813477, + "learning_rate": 1e-06, + "loss": 0.2804, + "mean_token_accuracy": 0.898780345916748, + "num_tokens": 575688949.0, + "step": 15793 + }, + { + "epoch": 2.932961931290622, + "grad_norm": 1.5903252363204956, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8944187164306641, + "num_tokens": 575724213.0, + "step": 15794 + }, + { + "epoch": 2.933147632311978, + "grad_norm": 1.5725408792495728, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.886974573135376, + "num_tokens": 575765682.0, + "step": 15795 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 1.5696617364883423, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8787273168563843, + "num_tokens": 575805507.0, + "step": 15796 + }, + { + "epoch": 2.933519034354689, + "grad_norm": 1.4332818984985352, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8840293884277344, + "num_tokens": 575846830.0, + "step": 15797 + }, + { + "epoch": 2.9337047353760446, + "grad_norm": 1.6843112707138062, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8869671821594238, + "num_tokens": 575882139.0, + "step": 15798 + }, + { + "epoch": 2.9338904363974003, + "grad_norm": 1.4507489204406738, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8821824193000793, + "num_tokens": 575926518.0, + "step": 15799 + }, + { + "epoch": 2.9340761374187556, + "grad_norm": 1.6130318641662598, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.883692741394043, + "num_tokens": 575962179.0, + "step": 15800 + }, + { + "epoch": 2.9342618384401113, + "grad_norm": 1.6301313638687134, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8850400447845459, + "num_tokens": 575998557.0, + "step": 15801 + }, + { + "epoch": 2.934447539461467, + "grad_norm": 1.6790878772735596, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8908014297485352, + "num_tokens": 576034882.0, + "step": 15802 + }, + { + "epoch": 2.934633240482823, + "grad_norm": 1.39052414894104, + "learning_rate": 1e-06, + "loss": 0.2639, + "mean_token_accuracy": 0.9022025465965271, + "num_tokens": 576076336.0, + "step": 15803 + }, + { + "epoch": 2.9348189415041785, + "grad_norm": 1.5250210762023926, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8781402111053467, + "num_tokens": 576118246.0, + "step": 15804 + }, + { + "epoch": 2.935004642525534, + "grad_norm": 1.6648818254470825, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8828392028808594, + "num_tokens": 576154115.0, + "step": 15805 + }, + { + "epoch": 2.9351903435468896, + "grad_norm": 1.706830620765686, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8787861466407776, + "num_tokens": 576184454.0, + "step": 15806 + }, + { + "epoch": 2.935376044568245, + "grad_norm": 1.9471219778060913, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8950284719467163, + "num_tokens": 576217604.0, + "step": 15807 + }, + { + "epoch": 2.9355617455896006, + "grad_norm": 1.7867152690887451, + "learning_rate": 1e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.8961907625198364, + "num_tokens": 576253883.0, + "step": 15808 + }, + { + "epoch": 2.9357474466109563, + "grad_norm": 1.636101245880127, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8884056210517883, + "num_tokens": 576289722.0, + "step": 15809 + }, + { + "epoch": 2.935933147632312, + "grad_norm": 1.4495141506195068, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8925098180770874, + "num_tokens": 576331610.0, + "step": 15810 + }, + { + "epoch": 2.936118848653668, + "grad_norm": 1.3915884494781494, + "learning_rate": 1e-06, + "loss": 0.2594, + "mean_token_accuracy": 0.9036111831665039, + "num_tokens": 576370199.0, + "step": 15811 + }, + { + "epoch": 2.936304549675023, + "grad_norm": 1.5847407579421997, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8857574462890625, + "num_tokens": 576408400.0, + "step": 15812 + }, + { + "epoch": 2.936490250696379, + "grad_norm": 1.6778945922851562, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8814055323600769, + "num_tokens": 576441659.0, + "step": 15813 + }, + { + "epoch": 2.9366759517177345, + "grad_norm": 1.7108995914459229, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8767921924591064, + "num_tokens": 576476605.0, + "step": 15814 + }, + { + "epoch": 2.93686165273909, + "grad_norm": 1.723344087600708, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8850464820861816, + "num_tokens": 576512242.0, + "step": 15815 + }, + { + "epoch": 2.9370473537604456, + "grad_norm": 1.6505309343338013, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8869085907936096, + "num_tokens": 576547582.0, + "step": 15816 + }, + { + "epoch": 2.9372330547818013, + "grad_norm": 1.4884059429168701, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.895683765411377, + "num_tokens": 576584735.0, + "step": 15817 + }, + { + "epoch": 2.937418755803157, + "grad_norm": 1.593281865119934, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8904189467430115, + "num_tokens": 576618114.0, + "step": 15818 + }, + { + "epoch": 2.9376044568245128, + "grad_norm": 1.5147485733032227, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8871929049491882, + "num_tokens": 576654488.0, + "step": 15819 + }, + { + "epoch": 2.937790157845868, + "grad_norm": 1.7300785779953003, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8740572333335876, + "num_tokens": 576691039.0, + "step": 15820 + }, + { + "epoch": 2.937975858867224, + "grad_norm": 1.5440458059310913, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8885401487350464, + "num_tokens": 576732156.0, + "step": 15821 + }, + { + "epoch": 2.9381615598885795, + "grad_norm": 1.5988898277282715, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8875714540481567, + "num_tokens": 576769288.0, + "step": 15822 + }, + { + "epoch": 2.938347260909935, + "grad_norm": 1.4955506324768066, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.889394998550415, + "num_tokens": 576810922.0, + "step": 15823 + }, + { + "epoch": 2.9385329619312905, + "grad_norm": 1.6578361988067627, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.884865403175354, + "num_tokens": 576846105.0, + "step": 15824 + }, + { + "epoch": 2.9387186629526463, + "grad_norm": 1.5011714696884155, + "learning_rate": 1e-06, + "loss": 0.2502, + "mean_token_accuracy": 0.9068630933761597, + "num_tokens": 576884709.0, + "step": 15825 + }, + { + "epoch": 2.938904363974002, + "grad_norm": 1.6354589462280273, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8890988826751709, + "num_tokens": 576920333.0, + "step": 15826 + }, + { + "epoch": 2.9390900649953577, + "grad_norm": 1.626613974571228, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8827012777328491, + "num_tokens": 576957616.0, + "step": 15827 + }, + { + "epoch": 2.939275766016713, + "grad_norm": 1.4967467784881592, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8883680701255798, + "num_tokens": 576998423.0, + "step": 15828 + }, + { + "epoch": 2.9394614670380688, + "grad_norm": 1.5511422157287598, + "learning_rate": 1e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.899307370185852, + "num_tokens": 577032739.0, + "step": 15829 + }, + { + "epoch": 2.939647168059424, + "grad_norm": 1.4809913635253906, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8750250339508057, + "num_tokens": 577075032.0, + "step": 15830 + }, + { + "epoch": 2.93983286908078, + "grad_norm": 1.6445844173431396, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8835741281509399, + "num_tokens": 577112380.0, + "step": 15831 + }, + { + "epoch": 2.9400185701021355, + "grad_norm": 1.4900075197219849, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8930138945579529, + "num_tokens": 577152674.0, + "step": 15832 + }, + { + "epoch": 2.9402042711234913, + "grad_norm": 1.6984690427780151, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.875099778175354, + "num_tokens": 577187066.0, + "step": 15833 + }, + { + "epoch": 2.940389972144847, + "grad_norm": 1.5109509229660034, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8894886374473572, + "num_tokens": 577225389.0, + "step": 15834 + }, + { + "epoch": 2.9405756731662023, + "grad_norm": 1.6690646409988403, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8689479827880859, + "num_tokens": 577261872.0, + "step": 15835 + }, + { + "epoch": 2.940761374187558, + "grad_norm": 1.4994072914123535, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.881254255771637, + "num_tokens": 577306734.0, + "step": 15836 + }, + { + "epoch": 2.9409470752089137, + "grad_norm": 1.5622732639312744, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8919670581817627, + "num_tokens": 577342373.0, + "step": 15837 + }, + { + "epoch": 2.941132776230269, + "grad_norm": 1.6260398626327515, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8773639798164368, + "num_tokens": 577379399.0, + "step": 15838 + }, + { + "epoch": 2.9413184772516248, + "grad_norm": 1.734802484512329, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8677307963371277, + "num_tokens": 577413325.0, + "step": 15839 + }, + { + "epoch": 2.9415041782729805, + "grad_norm": 1.583904504776001, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8867871761322021, + "num_tokens": 577448762.0, + "step": 15840 + }, + { + "epoch": 2.9416898792943362, + "grad_norm": 1.5858240127563477, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8912485837936401, + "num_tokens": 577483040.0, + "step": 15841 + }, + { + "epoch": 2.941875580315692, + "grad_norm": 1.4880144596099854, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8916304111480713, + "num_tokens": 577522086.0, + "step": 15842 + }, + { + "epoch": 2.9420612813370473, + "grad_norm": 1.5734593868255615, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8876713514328003, + "num_tokens": 577561888.0, + "step": 15843 + }, + { + "epoch": 2.942246982358403, + "grad_norm": 1.745066523551941, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8911795020103455, + "num_tokens": 577594655.0, + "step": 15844 + }, + { + "epoch": 2.9424326833797587, + "grad_norm": 1.5722222328186035, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8687745332717896, + "num_tokens": 577636541.0, + "step": 15845 + }, + { + "epoch": 2.942618384401114, + "grad_norm": 1.5281779766082764, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.8945004940032959, + "num_tokens": 577672929.0, + "step": 15846 + }, + { + "epoch": 2.9428040854224697, + "grad_norm": 1.5461806058883667, + "learning_rate": 1e-06, + "loss": 0.2739, + "mean_token_accuracy": 0.9000740051269531, + "num_tokens": 577707637.0, + "step": 15847 + }, + { + "epoch": 2.9429897864438255, + "grad_norm": 1.5243732929229736, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8933342695236206, + "num_tokens": 577746657.0, + "step": 15848 + }, + { + "epoch": 2.943175487465181, + "grad_norm": 1.5496153831481934, + "learning_rate": 1e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.8982841968536377, + "num_tokens": 577782228.0, + "step": 15849 + }, + { + "epoch": 2.943361188486537, + "grad_norm": 1.5713614225387573, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8892099857330322, + "num_tokens": 577817666.0, + "step": 15850 + }, + { + "epoch": 2.9435468895078922, + "grad_norm": 1.5598323345184326, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8764503002166748, + "num_tokens": 577856942.0, + "step": 15851 + }, + { + "epoch": 2.943732590529248, + "grad_norm": 1.7602649927139282, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8941066265106201, + "num_tokens": 577887233.0, + "step": 15852 + }, + { + "epoch": 2.9439182915506037, + "grad_norm": 1.6393849849700928, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8762099146842957, + "num_tokens": 577928752.0, + "step": 15853 + }, + { + "epoch": 2.944103992571959, + "grad_norm": 1.5963724851608276, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.886722207069397, + "num_tokens": 577961775.0, + "step": 15854 + }, + { + "epoch": 2.9442896935933147, + "grad_norm": 1.6253472566604614, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8876059651374817, + "num_tokens": 577997398.0, + "step": 15855 + }, + { + "epoch": 2.9444753946146704, + "grad_norm": 1.7356867790222168, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8918442726135254, + "num_tokens": 578029925.0, + "step": 15856 + }, + { + "epoch": 2.944661095636026, + "grad_norm": 1.4954241514205933, + "learning_rate": 1e-06, + "loss": 0.2752, + "mean_token_accuracy": 0.9001823663711548, + "num_tokens": 578068938.0, + "step": 15857 + }, + { + "epoch": 2.9448467966573815, + "grad_norm": 1.549670934677124, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8823113441467285, + "num_tokens": 578105494.0, + "step": 15858 + }, + { + "epoch": 2.945032497678737, + "grad_norm": 1.5550751686096191, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8845275640487671, + "num_tokens": 578144196.0, + "step": 15859 + }, + { + "epoch": 2.945218198700093, + "grad_norm": 1.594275712966919, + "learning_rate": 1e-06, + "loss": 0.2625, + "mean_token_accuracy": 0.9064550995826721, + "num_tokens": 578176710.0, + "step": 15860 + }, + { + "epoch": 2.9454038997214482, + "grad_norm": 1.4651870727539062, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8928184509277344, + "num_tokens": 578218311.0, + "step": 15861 + }, + { + "epoch": 2.945589600742804, + "grad_norm": 1.4567383527755737, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8949762582778931, + "num_tokens": 578261855.0, + "step": 15862 + }, + { + "epoch": 2.9457753017641597, + "grad_norm": 1.735848307609558, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8656395077705383, + "num_tokens": 578296537.0, + "step": 15863 + }, + { + "epoch": 2.9459610027855154, + "grad_norm": 1.7282589673995972, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8880233764648438, + "num_tokens": 578331803.0, + "step": 15864 + }, + { + "epoch": 2.946146703806871, + "grad_norm": 1.5798449516296387, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8855249881744385, + "num_tokens": 578370278.0, + "step": 15865 + }, + { + "epoch": 2.9463324048282264, + "grad_norm": 1.688045859336853, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8749985694885254, + "num_tokens": 578401810.0, + "step": 15866 + }, + { + "epoch": 2.946518105849582, + "grad_norm": 1.6298816204071045, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8873980641365051, + "num_tokens": 578435596.0, + "step": 15867 + }, + { + "epoch": 2.946703806870938, + "grad_norm": 1.5904194116592407, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8727313280105591, + "num_tokens": 578477691.0, + "step": 15868 + }, + { + "epoch": 2.946889507892293, + "grad_norm": 2.2182130813598633, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.8969788551330566, + "num_tokens": 578512978.0, + "step": 15869 + }, + { + "epoch": 2.947075208913649, + "grad_norm": 1.6514863967895508, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8888493180274963, + "num_tokens": 578549798.0, + "step": 15870 + }, + { + "epoch": 2.9472609099350047, + "grad_norm": 1.6205068826675415, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.88722825050354, + "num_tokens": 578587260.0, + "step": 15871 + }, + { + "epoch": 2.9474466109563604, + "grad_norm": 1.6429767608642578, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8864181041717529, + "num_tokens": 578622320.0, + "step": 15872 + }, + { + "epoch": 2.947632311977716, + "grad_norm": 1.4719802141189575, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8884900808334351, + "num_tokens": 578659555.0, + "step": 15873 + }, + { + "epoch": 2.9478180129990714, + "grad_norm": 1.7073297500610352, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8868274688720703, + "num_tokens": 578692018.0, + "step": 15874 + }, + { + "epoch": 2.948003714020427, + "grad_norm": 1.5387427806854248, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8893008828163147, + "num_tokens": 578731645.0, + "step": 15875 + }, + { + "epoch": 2.948189415041783, + "grad_norm": 1.5819718837738037, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.8963725566864014, + "num_tokens": 578766587.0, + "step": 15876 + }, + { + "epoch": 2.948375116063138, + "grad_norm": 1.5574944019317627, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8930386304855347, + "num_tokens": 578801983.0, + "step": 15877 + }, + { + "epoch": 2.948560817084494, + "grad_norm": 1.5661909580230713, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8821408748626709, + "num_tokens": 578839924.0, + "step": 15878 + }, + { + "epoch": 2.9487465181058496, + "grad_norm": 1.5341440439224243, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8834240436553955, + "num_tokens": 578877314.0, + "step": 15879 + }, + { + "epoch": 2.9489322191272054, + "grad_norm": 1.8190550804138184, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8759006857872009, + "num_tokens": 578908278.0, + "step": 15880 + }, + { + "epoch": 2.9491179201485607, + "grad_norm": 1.563800573348999, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8897995352745056, + "num_tokens": 578946103.0, + "step": 15881 + }, + { + "epoch": 2.9493036211699164, + "grad_norm": 1.6664698123931885, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8872636556625366, + "num_tokens": 578977783.0, + "step": 15882 + }, + { + "epoch": 2.949489322191272, + "grad_norm": 1.6084890365600586, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8941822052001953, + "num_tokens": 579012914.0, + "step": 15883 + }, + { + "epoch": 2.9496750232126274, + "grad_norm": 1.6440293788909912, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8727013468742371, + "num_tokens": 579054983.0, + "step": 15884 + }, + { + "epoch": 2.949860724233983, + "grad_norm": 1.7734575271606445, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.888554573059082, + "num_tokens": 579088687.0, + "step": 15885 + }, + { + "epoch": 2.950046425255339, + "grad_norm": 1.6624562740325928, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8881858587265015, + "num_tokens": 579121367.0, + "step": 15886 + }, + { + "epoch": 2.9502321262766946, + "grad_norm": 1.5686064958572388, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8740477561950684, + "num_tokens": 579161158.0, + "step": 15887 + }, + { + "epoch": 2.9504178272980504, + "grad_norm": 1.6776140928268433, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.876366376876831, + "num_tokens": 579201003.0, + "step": 15888 + }, + { + "epoch": 2.9506035283194056, + "grad_norm": 1.66441810131073, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8813985586166382, + "num_tokens": 579234980.0, + "step": 15889 + }, + { + "epoch": 2.9507892293407614, + "grad_norm": 1.5318691730499268, + "learning_rate": 1e-06, + "loss": 0.2569, + "mean_token_accuracy": 0.9068248867988586, + "num_tokens": 579271960.0, + "step": 15890 + }, + { + "epoch": 2.950974930362117, + "grad_norm": 1.581983208656311, + "learning_rate": 1e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9006973505020142, + "num_tokens": 579306298.0, + "step": 15891 + }, + { + "epoch": 2.9511606313834724, + "grad_norm": 1.794323205947876, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8731540441513062, + "num_tokens": 579339661.0, + "step": 15892 + }, + { + "epoch": 2.951346332404828, + "grad_norm": 1.5627321004867554, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8864414095878601, + "num_tokens": 579377068.0, + "step": 15893 + }, + { + "epoch": 2.951532033426184, + "grad_norm": 1.5310826301574707, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8928236961364746, + "num_tokens": 579415114.0, + "step": 15894 + }, + { + "epoch": 2.9517177344475396, + "grad_norm": 1.6993781328201294, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.864560604095459, + "num_tokens": 579455580.0, + "step": 15895 + }, + { + "epoch": 2.9519034354688953, + "grad_norm": 1.6108659505844116, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8894675970077515, + "num_tokens": 579489743.0, + "step": 15896 + }, + { + "epoch": 2.9520891364902506, + "grad_norm": 1.6280702352523804, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8927958011627197, + "num_tokens": 579521128.0, + "step": 15897 + }, + { + "epoch": 2.9522748375116064, + "grad_norm": 1.7341119050979614, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8860747218132019, + "num_tokens": 579551659.0, + "step": 15898 + }, + { + "epoch": 2.952460538532962, + "grad_norm": 1.79750394821167, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.890720009803772, + "num_tokens": 579581493.0, + "step": 15899 + }, + { + "epoch": 2.9526462395543174, + "grad_norm": 1.5753520727157593, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8807837963104248, + "num_tokens": 579620782.0, + "step": 15900 + }, + { + "epoch": 2.952831940575673, + "grad_norm": 1.5473331212997437, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8885167837142944, + "num_tokens": 579657333.0, + "step": 15901 + }, + { + "epoch": 2.953017641597029, + "grad_norm": 1.6343894004821777, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8959382772445679, + "num_tokens": 579692011.0, + "step": 15902 + }, + { + "epoch": 2.9532033426183846, + "grad_norm": 1.7246971130371094, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8886257410049438, + "num_tokens": 579724008.0, + "step": 15903 + }, + { + "epoch": 2.95338904363974, + "grad_norm": 1.6310899257659912, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8802393674850464, + "num_tokens": 579764498.0, + "step": 15904 + }, + { + "epoch": 2.9535747446610956, + "grad_norm": 1.603816270828247, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8819493055343628, + "num_tokens": 579801641.0, + "step": 15905 + }, + { + "epoch": 2.9537604456824513, + "grad_norm": 1.578871726989746, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8956221342086792, + "num_tokens": 579837006.0, + "step": 15906 + }, + { + "epoch": 2.9539461467038066, + "grad_norm": 1.6051206588745117, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8791677951812744, + "num_tokens": 579875568.0, + "step": 15907 + }, + { + "epoch": 2.9541318477251624, + "grad_norm": 1.7658114433288574, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8925848603248596, + "num_tokens": 579908221.0, + "step": 15908 + }, + { + "epoch": 2.954317548746518, + "grad_norm": 1.468177318572998, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.8959459066390991, + "num_tokens": 579948291.0, + "step": 15909 + }, + { + "epoch": 2.954503249767874, + "grad_norm": 1.4781222343444824, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8904219269752502, + "num_tokens": 579985960.0, + "step": 15910 + }, + { + "epoch": 2.9546889507892296, + "grad_norm": 1.6068177223205566, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8899563550949097, + "num_tokens": 580020029.0, + "step": 15911 + }, + { + "epoch": 2.954874651810585, + "grad_norm": 1.7580530643463135, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8854002356529236, + "num_tokens": 580052402.0, + "step": 15912 + }, + { + "epoch": 2.9550603528319406, + "grad_norm": 1.712408185005188, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8923190832138062, + "num_tokens": 580084554.0, + "step": 15913 + }, + { + "epoch": 2.9552460538532963, + "grad_norm": 1.6521860361099243, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8807594776153564, + "num_tokens": 580120412.0, + "step": 15914 + }, + { + "epoch": 2.9554317548746516, + "grad_norm": 1.5020843744277954, + "learning_rate": 1e-06, + "loss": 0.2768, + "mean_token_accuracy": 0.8977354168891907, + "num_tokens": 580156857.0, + "step": 15915 + }, + { + "epoch": 2.9556174558960073, + "grad_norm": 1.8386800289154053, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8793220520019531, + "num_tokens": 580187301.0, + "step": 15916 + }, + { + "epoch": 2.955803156917363, + "grad_norm": 1.480303406715393, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8939062356948853, + "num_tokens": 580228059.0, + "step": 15917 + }, + { + "epoch": 2.955988857938719, + "grad_norm": 1.5256285667419434, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8853927254676819, + "num_tokens": 580268331.0, + "step": 15918 + }, + { + "epoch": 2.9561745589600745, + "grad_norm": 1.627629041671753, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8819509744644165, + "num_tokens": 580304685.0, + "step": 15919 + }, + { + "epoch": 2.95636025998143, + "grad_norm": 1.4490643739700317, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8911710977554321, + "num_tokens": 580345638.0, + "step": 15920 + }, + { + "epoch": 2.9565459610027855, + "grad_norm": 1.549879550933838, + "learning_rate": 1e-06, + "loss": 0.2819, + "mean_token_accuracy": 0.896919310092926, + "num_tokens": 580378920.0, + "step": 15921 + }, + { + "epoch": 2.9567316620241413, + "grad_norm": 1.626097559928894, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8922157287597656, + "num_tokens": 580412050.0, + "step": 15922 + }, + { + "epoch": 2.9569173630454966, + "grad_norm": 1.7466659545898438, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8891490697860718, + "num_tokens": 580443751.0, + "step": 15923 + }, + { + "epoch": 2.9571030640668523, + "grad_norm": 1.7914577722549438, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8808822631835938, + "num_tokens": 580475378.0, + "step": 15924 + }, + { + "epoch": 2.957288765088208, + "grad_norm": 1.6144059896469116, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8814905285835266, + "num_tokens": 580511083.0, + "step": 15925 + }, + { + "epoch": 2.9574744661095638, + "grad_norm": 1.5236871242523193, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.8943317532539368, + "num_tokens": 580548308.0, + "step": 15926 + }, + { + "epoch": 2.957660167130919, + "grad_norm": 1.5707992315292358, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8886024951934814, + "num_tokens": 580586904.0, + "step": 15927 + }, + { + "epoch": 2.957845868152275, + "grad_norm": 1.6776949167251587, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8818914890289307, + "num_tokens": 580624844.0, + "step": 15928 + }, + { + "epoch": 2.9580315691736305, + "grad_norm": 1.5813634395599365, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8894838094711304, + "num_tokens": 580662094.0, + "step": 15929 + }, + { + "epoch": 2.958217270194986, + "grad_norm": 1.6092323064804077, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8818627595901489, + "num_tokens": 580701214.0, + "step": 15930 + }, + { + "epoch": 2.9584029712163415, + "grad_norm": 1.669706106185913, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.885360062122345, + "num_tokens": 580736288.0, + "step": 15931 + }, + { + "epoch": 2.9585886722376973, + "grad_norm": 1.4201641082763672, + "learning_rate": 1e-06, + "loss": 0.2577, + "mean_token_accuracy": 0.9084429144859314, + "num_tokens": 580774520.0, + "step": 15932 + }, + { + "epoch": 2.958774373259053, + "grad_norm": 1.388819694519043, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.9018952250480652, + "num_tokens": 580816467.0, + "step": 15933 + }, + { + "epoch": 2.9589600742804087, + "grad_norm": 1.5953893661499023, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8875943422317505, + "num_tokens": 580852911.0, + "step": 15934 + }, + { + "epoch": 2.959145775301764, + "grad_norm": 1.3729705810546875, + "learning_rate": 1e-06, + "loss": 0.2582, + "mean_token_accuracy": 0.9056107997894287, + "num_tokens": 580896613.0, + "step": 15935 + }, + { + "epoch": 2.9593314763231198, + "grad_norm": 1.6066265106201172, + "learning_rate": 1e-06, + "loss": 0.2749, + "mean_token_accuracy": 0.9005553722381592, + "num_tokens": 580927565.0, + "step": 15936 + }, + { + "epoch": 2.9595171773444755, + "grad_norm": 1.604015588760376, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.886826753616333, + "num_tokens": 580964129.0, + "step": 15937 + }, + { + "epoch": 2.959702878365831, + "grad_norm": 1.6315999031066895, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8919404745101929, + "num_tokens": 580995321.0, + "step": 15938 + }, + { + "epoch": 2.9598885793871865, + "grad_norm": 1.4908335208892822, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8935487270355225, + "num_tokens": 581036804.0, + "step": 15939 + }, + { + "epoch": 2.9600742804085423, + "grad_norm": 1.6456197500228882, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8945738077163696, + "num_tokens": 581066610.0, + "step": 15940 + }, + { + "epoch": 2.960259981429898, + "grad_norm": 1.8445273637771606, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8831201195716858, + "num_tokens": 581095527.0, + "step": 15941 + }, + { + "epoch": 2.9604456824512537, + "grad_norm": 1.5669920444488525, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8787189722061157, + "num_tokens": 581136499.0, + "step": 15942 + }, + { + "epoch": 2.960631383472609, + "grad_norm": 1.4107950925827026, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.8980250358581543, + "num_tokens": 581176676.0, + "step": 15943 + }, + { + "epoch": 2.9608170844939647, + "grad_norm": 1.755807876586914, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8862651586532593, + "num_tokens": 581209032.0, + "step": 15944 + }, + { + "epoch": 2.9610027855153205, + "grad_norm": 1.730268955230713, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8698649406433105, + "num_tokens": 581243966.0, + "step": 15945 + }, + { + "epoch": 2.9611884865366758, + "grad_norm": 1.4933642148971558, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8949403166770935, + "num_tokens": 581282870.0, + "step": 15946 + }, + { + "epoch": 2.9613741875580315, + "grad_norm": 1.6788979768753052, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8798593878746033, + "num_tokens": 581319230.0, + "step": 15947 + }, + { + "epoch": 2.9615598885793872, + "grad_norm": 1.5993061065673828, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8841935992240906, + "num_tokens": 581355535.0, + "step": 15948 + }, + { + "epoch": 2.961745589600743, + "grad_norm": 1.6205940246582031, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8828411102294922, + "num_tokens": 581391261.0, + "step": 15949 + }, + { + "epoch": 2.9619312906220983, + "grad_norm": 1.5797535181045532, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8868210315704346, + "num_tokens": 581428431.0, + "step": 15950 + }, + { + "epoch": 2.962116991643454, + "grad_norm": 1.5253984928131104, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8908222913742065, + "num_tokens": 581467340.0, + "step": 15951 + }, + { + "epoch": 2.9623026926648097, + "grad_norm": 1.6023856401443481, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8840845823287964, + "num_tokens": 581502444.0, + "step": 15952 + }, + { + "epoch": 2.962488393686165, + "grad_norm": 1.5993375778198242, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.876733124256134, + "num_tokens": 581538037.0, + "step": 15953 + }, + { + "epoch": 2.9626740947075207, + "grad_norm": 1.5407915115356445, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.882156491279602, + "num_tokens": 581578648.0, + "step": 15954 + }, + { + "epoch": 2.9628597957288765, + "grad_norm": 1.5685534477233887, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8892674446105957, + "num_tokens": 581615782.0, + "step": 15955 + }, + { + "epoch": 2.963045496750232, + "grad_norm": 1.571102261543274, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8697623014450073, + "num_tokens": 581656114.0, + "step": 15956 + }, + { + "epoch": 2.963231197771588, + "grad_norm": 1.8848509788513184, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8705079555511475, + "num_tokens": 581686267.0, + "step": 15957 + }, + { + "epoch": 2.9634168987929432, + "grad_norm": 1.4662076234817505, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8935784101486206, + "num_tokens": 581726216.0, + "step": 15958 + }, + { + "epoch": 2.963602599814299, + "grad_norm": 1.540383219718933, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8916020393371582, + "num_tokens": 581765206.0, + "step": 15959 + }, + { + "epoch": 2.9637883008356547, + "grad_norm": 1.7870711088180542, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8906802535057068, + "num_tokens": 581794644.0, + "step": 15960 + }, + { + "epoch": 2.96397400185701, + "grad_norm": 1.6346217393875122, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8918486833572388, + "num_tokens": 581832478.0, + "step": 15961 + }, + { + "epoch": 2.9641597028783657, + "grad_norm": 1.679598331451416, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8832345008850098, + "num_tokens": 581864432.0, + "step": 15962 + }, + { + "epoch": 2.9643454038997215, + "grad_norm": 1.751076340675354, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8753835558891296, + "num_tokens": 581897017.0, + "step": 15963 + }, + { + "epoch": 2.964531104921077, + "grad_norm": 1.7551610469818115, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8830023407936096, + "num_tokens": 581930247.0, + "step": 15964 + }, + { + "epoch": 2.964716805942433, + "grad_norm": 1.6861730813980103, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8849797248840332, + "num_tokens": 581964087.0, + "step": 15965 + }, + { + "epoch": 2.964902506963788, + "grad_norm": 1.7027347087860107, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8949184417724609, + "num_tokens": 581995392.0, + "step": 15966 + }, + { + "epoch": 2.965088207985144, + "grad_norm": 1.5879700183868408, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8821430206298828, + "num_tokens": 582033146.0, + "step": 15967 + }, + { + "epoch": 2.9652739090064997, + "grad_norm": 1.4408937692642212, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8851385116577148, + "num_tokens": 582076464.0, + "step": 15968 + }, + { + "epoch": 2.965459610027855, + "grad_norm": 1.5233880281448364, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8851218223571777, + "num_tokens": 582116021.0, + "step": 15969 + }, + { + "epoch": 2.9656453110492107, + "grad_norm": 1.5999679565429688, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8856585025787354, + "num_tokens": 582154628.0, + "step": 15970 + }, + { + "epoch": 2.9658310120705664, + "grad_norm": 1.6165906190872192, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8913390040397644, + "num_tokens": 582191601.0, + "step": 15971 + }, + { + "epoch": 2.966016713091922, + "grad_norm": 1.6703072786331177, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8935713171958923, + "num_tokens": 582222450.0, + "step": 15972 + }, + { + "epoch": 2.966202414113278, + "grad_norm": 1.670849323272705, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8877387046813965, + "num_tokens": 582255321.0, + "step": 15973 + }, + { + "epoch": 2.966388115134633, + "grad_norm": 1.6333184242248535, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.8932886123657227, + "num_tokens": 582291620.0, + "step": 15974 + }, + { + "epoch": 2.966573816155989, + "grad_norm": 1.7311248779296875, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8876833319664001, + "num_tokens": 582322740.0, + "step": 15975 + }, + { + "epoch": 2.966759517177344, + "grad_norm": 1.679830551147461, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.8914812803268433, + "num_tokens": 582355620.0, + "step": 15976 + }, + { + "epoch": 2.9669452181987, + "grad_norm": 1.5496658086776733, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8782062530517578, + "num_tokens": 582396378.0, + "step": 15977 + }, + { + "epoch": 2.9671309192200557, + "grad_norm": 1.529058814048767, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8840954303741455, + "num_tokens": 582433684.0, + "step": 15978 + }, + { + "epoch": 2.9673166202414114, + "grad_norm": 1.5480420589447021, + "learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.8963944911956787, + "num_tokens": 582472357.0, + "step": 15979 + }, + { + "epoch": 2.967502321262767, + "grad_norm": 1.73835289478302, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8930306434631348, + "num_tokens": 582504465.0, + "step": 15980 + }, + { + "epoch": 2.9676880222841224, + "grad_norm": 1.6527817249298096, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8859058618545532, + "num_tokens": 582537583.0, + "step": 15981 + }, + { + "epoch": 2.967873723305478, + "grad_norm": 1.4442113637924194, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.8936904668807983, + "num_tokens": 582578729.0, + "step": 15982 + }, + { + "epoch": 2.968059424326834, + "grad_norm": 1.6096444129943848, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8890339136123657, + "num_tokens": 582614315.0, + "step": 15983 + }, + { + "epoch": 2.968245125348189, + "grad_norm": 1.5238769054412842, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8796667456626892, + "num_tokens": 582654043.0, + "step": 15984 + }, + { + "epoch": 2.968430826369545, + "grad_norm": 1.7884873151779175, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8672897219657898, + "num_tokens": 582697816.0, + "step": 15985 + }, + { + "epoch": 2.9686165273909007, + "grad_norm": 1.46133291721344, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8954052925109863, + "num_tokens": 582738496.0, + "step": 15986 + }, + { + "epoch": 2.9688022284122564, + "grad_norm": 1.4715389013290405, + "learning_rate": 1e-06, + "loss": 0.2847, + "mean_token_accuracy": 0.8955234289169312, + "num_tokens": 582779629.0, + "step": 15987 + }, + { + "epoch": 2.968987929433612, + "grad_norm": 1.5406345129013062, + "learning_rate": 1e-06, + "loss": 0.2518, + "mean_token_accuracy": 0.9099497199058533, + "num_tokens": 582811947.0, + "step": 15988 + }, + { + "epoch": 2.9691736304549674, + "grad_norm": 1.6348267793655396, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8952951431274414, + "num_tokens": 582843615.0, + "step": 15989 + }, + { + "epoch": 2.969359331476323, + "grad_norm": 1.6347547769546509, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8822416067123413, + "num_tokens": 582877750.0, + "step": 15990 + }, + { + "epoch": 2.969545032497679, + "grad_norm": 1.5372177362442017, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8878598809242249, + "num_tokens": 582913995.0, + "step": 15991 + }, + { + "epoch": 2.969730733519034, + "grad_norm": 1.6330827474594116, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8872886896133423, + "num_tokens": 582947905.0, + "step": 15992 + }, + { + "epoch": 2.96991643454039, + "grad_norm": 1.4732431173324585, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8828288912773132, + "num_tokens": 582993436.0, + "step": 15993 + }, + { + "epoch": 2.9701021355617456, + "grad_norm": 1.7144900560379028, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.8967580795288086, + "num_tokens": 583029353.0, + "step": 15994 + }, + { + "epoch": 2.9702878365831014, + "grad_norm": 1.6334224939346313, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8778021931648254, + "num_tokens": 583068637.0, + "step": 15995 + }, + { + "epoch": 2.970473537604457, + "grad_norm": 1.5900071859359741, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.892653226852417, + "num_tokens": 583105052.0, + "step": 15996 + }, + { + "epoch": 2.9706592386258124, + "grad_norm": 1.7332135438919067, + "learning_rate": 1e-06, + "loss": 0.2785, + "mean_token_accuracy": 0.8995485305786133, + "num_tokens": 583134702.0, + "step": 15997 + }, + { + "epoch": 2.970844939647168, + "grad_norm": 1.520095705986023, + "learning_rate": 1e-06, + "loss": 0.2746, + "mean_token_accuracy": 0.9031186103820801, + "num_tokens": 583171340.0, + "step": 15998 + }, + { + "epoch": 2.9710306406685234, + "grad_norm": 1.692713975906372, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8893980979919434, + "num_tokens": 583206091.0, + "step": 15999 + }, + { + "epoch": 2.971216341689879, + "grad_norm": 1.6817870140075684, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8960842490196228, + "num_tokens": 583241982.0, + "step": 16000 + }, + { + "epoch": 2.971402042711235, + "grad_norm": 1.6347436904907227, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.8973426818847656, + "num_tokens": 583278281.0, + "step": 16001 + }, + { + "epoch": 2.9715877437325906, + "grad_norm": 1.6517869234085083, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8805068731307983, + "num_tokens": 583312555.0, + "step": 16002 + }, + { + "epoch": 2.9717734447539463, + "grad_norm": 1.7566889524459839, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8842744827270508, + "num_tokens": 583344064.0, + "step": 16003 + }, + { + "epoch": 2.9719591457753016, + "grad_norm": 1.6683858633041382, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8919862508773804, + "num_tokens": 583379648.0, + "step": 16004 + }, + { + "epoch": 2.9721448467966574, + "grad_norm": 1.6304038763046265, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8849104046821594, + "num_tokens": 583419260.0, + "step": 16005 + }, + { + "epoch": 2.972330547818013, + "grad_norm": 1.5560282468795776, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.885755181312561, + "num_tokens": 583460083.0, + "step": 16006 + }, + { + "epoch": 2.9725162488393684, + "grad_norm": 1.8040441274642944, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8796373605728149, + "num_tokens": 583490853.0, + "step": 16007 + }, + { + "epoch": 2.972701949860724, + "grad_norm": 1.5141730308532715, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8792605400085449, + "num_tokens": 583531778.0, + "step": 16008 + }, + { + "epoch": 2.97288765088208, + "grad_norm": 1.5193660259246826, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8897733688354492, + "num_tokens": 583574525.0, + "step": 16009 + }, + { + "epoch": 2.9730733519034356, + "grad_norm": 1.6775996685028076, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8812201023101807, + "num_tokens": 583610278.0, + "step": 16010 + }, + { + "epoch": 2.9732590529247913, + "grad_norm": 1.654746413230896, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8905812501907349, + "num_tokens": 583641288.0, + "step": 16011 + }, + { + "epoch": 2.9734447539461466, + "grad_norm": 1.7291646003723145, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8844412565231323, + "num_tokens": 583673692.0, + "step": 16012 + }, + { + "epoch": 2.9736304549675023, + "grad_norm": 1.6395448446273804, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8871583938598633, + "num_tokens": 583710623.0, + "step": 16013 + }, + { + "epoch": 2.973816155988858, + "grad_norm": 1.660895586013794, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8900008201599121, + "num_tokens": 583744144.0, + "step": 16014 + }, + { + "epoch": 2.9740018570102134, + "grad_norm": 1.7662303447723389, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8793717622756958, + "num_tokens": 583772906.0, + "step": 16015 + }, + { + "epoch": 2.974187558031569, + "grad_norm": 1.562021255493164, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.890447735786438, + "num_tokens": 583811709.0, + "step": 16016 + }, + { + "epoch": 2.974373259052925, + "grad_norm": 1.608229637145996, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8827575445175171, + "num_tokens": 583846304.0, + "step": 16017 + }, + { + "epoch": 2.9745589600742806, + "grad_norm": 1.526064157485962, + "learning_rate": 1e-06, + "loss": 0.2534, + "mean_token_accuracy": 0.9056121110916138, + "num_tokens": 583878733.0, + "step": 16018 + }, + { + "epoch": 2.9747446610956363, + "grad_norm": 1.4980438947677612, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8842334747314453, + "num_tokens": 583920877.0, + "step": 16019 + }, + { + "epoch": 2.9749303621169916, + "grad_norm": 1.4939993619918823, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8942817449569702, + "num_tokens": 583956421.0, + "step": 16020 + }, + { + "epoch": 2.9751160631383473, + "grad_norm": 1.6172418594360352, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8931635022163391, + "num_tokens": 583996538.0, + "step": 16021 + }, + { + "epoch": 2.975301764159703, + "grad_norm": 1.498183250427246, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8887355923652649, + "num_tokens": 584038569.0, + "step": 16022 + }, + { + "epoch": 2.9754874651810583, + "grad_norm": 1.4567502737045288, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8949377536773682, + "num_tokens": 584076924.0, + "step": 16023 + }, + { + "epoch": 2.975673166202414, + "grad_norm": 1.6622450351715088, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8831268548965454, + "num_tokens": 584113787.0, + "step": 16024 + }, + { + "epoch": 2.97585886722377, + "grad_norm": 1.6230448484420776, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8723775148391724, + "num_tokens": 584150959.0, + "step": 16025 + }, + { + "epoch": 2.9760445682451255, + "grad_norm": 1.6692367792129517, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.8997369408607483, + "num_tokens": 584181151.0, + "step": 16026 + }, + { + "epoch": 2.976230269266481, + "grad_norm": 1.426094651222229, + "learning_rate": 1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.897240400314331, + "num_tokens": 584220035.0, + "step": 16027 + }, + { + "epoch": 2.9764159702878366, + "grad_norm": 1.6320841312408447, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8762816190719604, + "num_tokens": 584256309.0, + "step": 16028 + }, + { + "epoch": 2.9766016713091923, + "grad_norm": 1.4466787576675415, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8999288082122803, + "num_tokens": 584298284.0, + "step": 16029 + }, + { + "epoch": 2.9767873723305476, + "grad_norm": 1.55617356300354, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.8983142375946045, + "num_tokens": 584335571.0, + "step": 16030 + }, + { + "epoch": 2.9769730733519033, + "grad_norm": 1.6493053436279297, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8751620054244995, + "num_tokens": 584373571.0, + "step": 16031 + }, + { + "epoch": 2.977158774373259, + "grad_norm": 1.6280477046966553, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8762727379798889, + "num_tokens": 584409903.0, + "step": 16032 + }, + { + "epoch": 2.9773444753946148, + "grad_norm": 1.6000783443450928, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8832930326461792, + "num_tokens": 584444483.0, + "step": 16033 + }, + { + "epoch": 2.9775301764159705, + "grad_norm": 1.5631990432739258, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8834403157234192, + "num_tokens": 584482759.0, + "step": 16034 + }, + { + "epoch": 2.977715877437326, + "grad_norm": 1.5199458599090576, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8896146416664124, + "num_tokens": 584520169.0, + "step": 16035 + }, + { + "epoch": 2.9779015784586815, + "grad_norm": 1.6312021017074585, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8709461092948914, + "num_tokens": 584557566.0, + "step": 16036 + }, + { + "epoch": 2.9780872794800373, + "grad_norm": 1.7304531335830688, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8747761845588684, + "num_tokens": 584590088.0, + "step": 16037 + }, + { + "epoch": 2.9782729805013926, + "grad_norm": 1.6190035343170166, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8862380981445312, + "num_tokens": 584624530.0, + "step": 16038 + }, + { + "epoch": 2.9784586815227483, + "grad_norm": 1.4833904504776, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.895796537399292, + "num_tokens": 584664577.0, + "step": 16039 + }, + { + "epoch": 2.978644382544104, + "grad_norm": 1.578047513961792, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8729445934295654, + "num_tokens": 584705058.0, + "step": 16040 + }, + { + "epoch": 2.9788300835654598, + "grad_norm": 1.7032047510147095, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8850799202919006, + "num_tokens": 584738605.0, + "step": 16041 + }, + { + "epoch": 2.9790157845868155, + "grad_norm": 1.662797451019287, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8933326005935669, + "num_tokens": 584774222.0, + "step": 16042 + }, + { + "epoch": 2.9792014856081708, + "grad_norm": 1.4186666011810303, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8884178400039673, + "num_tokens": 584819489.0, + "step": 16043 + }, + { + "epoch": 2.9793871866295265, + "grad_norm": 1.6768074035644531, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.868606448173523, + "num_tokens": 584856978.0, + "step": 16044 + }, + { + "epoch": 2.9795728876508822, + "grad_norm": 1.7005031108856201, + "learning_rate": 1e-06, + "loss": 0.2777, + "mean_token_accuracy": 0.8990064859390259, + "num_tokens": 584886038.0, + "step": 16045 + }, + { + "epoch": 2.9797585886722375, + "grad_norm": 1.5399123430252075, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8907822370529175, + "num_tokens": 584922778.0, + "step": 16046 + }, + { + "epoch": 2.9799442896935933, + "grad_norm": 1.6047202348709106, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8695554733276367, + "num_tokens": 584963834.0, + "step": 16047 + }, + { + "epoch": 2.980129990714949, + "grad_norm": 1.6266230344772339, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8896865844726562, + "num_tokens": 584997830.0, + "step": 16048 + }, + { + "epoch": 2.9803156917363047, + "grad_norm": 1.6196082830429077, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8899617791175842, + "num_tokens": 585033584.0, + "step": 16049 + }, + { + "epoch": 2.98050139275766, + "grad_norm": 1.6050204038619995, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8885633945465088, + "num_tokens": 585067556.0, + "step": 16050 + }, + { + "epoch": 2.9806870937790158, + "grad_norm": 1.6011968851089478, + "learning_rate": 1e-06, + "loss": 0.2673, + "mean_token_accuracy": 0.9031087160110474, + "num_tokens": 585102377.0, + "step": 16051 + }, + { + "epoch": 2.9808727948003715, + "grad_norm": 1.533669352531433, + "learning_rate": 1e-06, + "loss": 0.2792, + "mean_token_accuracy": 0.9005876183509827, + "num_tokens": 585138087.0, + "step": 16052 + }, + { + "epoch": 2.9810584958217268, + "grad_norm": 1.5935158729553223, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8752341270446777, + "num_tokens": 585175761.0, + "step": 16053 + }, + { + "epoch": 2.9812441968430825, + "grad_norm": 1.5280084609985352, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8858474493026733, + "num_tokens": 585216640.0, + "step": 16054 + }, + { + "epoch": 2.9814298978644382, + "grad_norm": 1.6721441745758057, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8811947703361511, + "num_tokens": 585254322.0, + "step": 16055 + }, + { + "epoch": 2.981615598885794, + "grad_norm": 1.7352381944656372, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8874813318252563, + "num_tokens": 585288589.0, + "step": 16056 + }, + { + "epoch": 2.9818012999071497, + "grad_norm": 1.6418002843856812, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8956192135810852, + "num_tokens": 585321207.0, + "step": 16057 + }, + { + "epoch": 2.981987000928505, + "grad_norm": 1.8188564777374268, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8818225264549255, + "num_tokens": 585352344.0, + "step": 16058 + }, + { + "epoch": 2.9821727019498607, + "grad_norm": 1.599359154701233, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.887054443359375, + "num_tokens": 585389508.0, + "step": 16059 + }, + { + "epoch": 2.9823584029712165, + "grad_norm": 1.499761939048767, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8888071179389954, + "num_tokens": 585429492.0, + "step": 16060 + }, + { + "epoch": 2.9825441039925717, + "grad_norm": 1.7148083448410034, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8808445930480957, + "num_tokens": 585467246.0, + "step": 16061 + }, + { + "epoch": 2.9827298050139275, + "grad_norm": 1.6200432777404785, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8799993991851807, + "num_tokens": 585502863.0, + "step": 16062 + }, + { + "epoch": 2.982915506035283, + "grad_norm": 1.4439144134521484, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8972903490066528, + "num_tokens": 585542131.0, + "step": 16063 + }, + { + "epoch": 2.983101207056639, + "grad_norm": 1.6024757623672485, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8890584111213684, + "num_tokens": 585578061.0, + "step": 16064 + }, + { + "epoch": 2.9832869080779947, + "grad_norm": 1.4447989463806152, + "learning_rate": 1e-06, + "loss": 0.2627, + "mean_token_accuracy": 0.9067748188972473, + "num_tokens": 585616418.0, + "step": 16065 + }, + { + "epoch": 2.98347260909935, + "grad_norm": 1.7509857416152954, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.878607451915741, + "num_tokens": 585648134.0, + "step": 16066 + }, + { + "epoch": 2.9836583101207057, + "grad_norm": 1.6907308101654053, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8545054793357849, + "num_tokens": 585691623.0, + "step": 16067 + }, + { + "epoch": 2.9838440111420614, + "grad_norm": 1.5752499103546143, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8803409337997437, + "num_tokens": 585732139.0, + "step": 16068 + }, + { + "epoch": 2.9840297121634167, + "grad_norm": 1.5009962320327759, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8912736177444458, + "num_tokens": 585771048.0, + "step": 16069 + }, + { + "epoch": 2.9842154131847725, + "grad_norm": 1.4084994792938232, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8951946496963501, + "num_tokens": 585812414.0, + "step": 16070 + }, + { + "epoch": 2.984401114206128, + "grad_norm": 1.332866907119751, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8962488770484924, + "num_tokens": 585855584.0, + "step": 16071 + }, + { + "epoch": 2.984586815227484, + "grad_norm": 1.7081245183944702, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8848099708557129, + "num_tokens": 585891350.0, + "step": 16072 + }, + { + "epoch": 2.984772516248839, + "grad_norm": 1.6074482202529907, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8822658061981201, + "num_tokens": 585929387.0, + "step": 16073 + }, + { + "epoch": 2.984958217270195, + "grad_norm": 1.456556797027588, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8883211612701416, + "num_tokens": 585971975.0, + "step": 16074 + }, + { + "epoch": 2.9851439182915507, + "grad_norm": 1.546555519104004, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8962463140487671, + "num_tokens": 586011192.0, + "step": 16075 + }, + { + "epoch": 2.985329619312906, + "grad_norm": 1.5957108736038208, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.89293372631073, + "num_tokens": 586050502.0, + "step": 16076 + }, + { + "epoch": 2.9855153203342617, + "grad_norm": 1.7174774408340454, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8860436677932739, + "num_tokens": 586084430.0, + "step": 16077 + }, + { + "epoch": 2.9857010213556174, + "grad_norm": 1.6964226961135864, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8849039673805237, + "num_tokens": 586119301.0, + "step": 16078 + }, + { + "epoch": 2.985886722376973, + "grad_norm": 1.615337610244751, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8891372680664062, + "num_tokens": 586156020.0, + "step": 16079 + }, + { + "epoch": 2.986072423398329, + "grad_norm": 1.6710186004638672, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8903341293334961, + "num_tokens": 586190356.0, + "step": 16080 + }, + { + "epoch": 2.986258124419684, + "grad_norm": 1.5133882761001587, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8956954479217529, + "num_tokens": 586226896.0, + "step": 16081 + }, + { + "epoch": 2.98644382544104, + "grad_norm": 1.5883023738861084, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8806973695755005, + "num_tokens": 586267821.0, + "step": 16082 + }, + { + "epoch": 2.9866295264623957, + "grad_norm": 1.56658935546875, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8693819046020508, + "num_tokens": 586308816.0, + "step": 16083 + }, + { + "epoch": 2.986815227483751, + "grad_norm": 1.693168044090271, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8781243562698364, + "num_tokens": 586344480.0, + "step": 16084 + }, + { + "epoch": 2.9870009285051067, + "grad_norm": 1.4945638179779053, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.9010543823242188, + "num_tokens": 586383173.0, + "step": 16085 + }, + { + "epoch": 2.9871866295264624, + "grad_norm": 1.4843629598617554, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8875911235809326, + "num_tokens": 586425697.0, + "step": 16086 + }, + { + "epoch": 2.987372330547818, + "grad_norm": 1.6191328763961792, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8939073085784912, + "num_tokens": 586458643.0, + "step": 16087 + }, + { + "epoch": 2.987558031569174, + "grad_norm": 1.54669988155365, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.885735809803009, + "num_tokens": 586496293.0, + "step": 16088 + }, + { + "epoch": 2.987743732590529, + "grad_norm": 1.7113438844680786, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8834933042526245, + "num_tokens": 586528663.0, + "step": 16089 + }, + { + "epoch": 2.987929433611885, + "grad_norm": 1.58464515209198, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8869141340255737, + "num_tokens": 586565018.0, + "step": 16090 + }, + { + "epoch": 2.9881151346332406, + "grad_norm": 1.6908819675445557, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8872807025909424, + "num_tokens": 586597098.0, + "step": 16091 + }, + { + "epoch": 2.988300835654596, + "grad_norm": 1.5083494186401367, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8892884254455566, + "num_tokens": 586635461.0, + "step": 16092 + }, + { + "epoch": 2.9884865366759517, + "grad_norm": 1.6510590314865112, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.896185040473938, + "num_tokens": 586668389.0, + "step": 16093 + }, + { + "epoch": 2.9886722376973074, + "grad_norm": 1.6180031299591064, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.879455029964447, + "num_tokens": 586703891.0, + "step": 16094 + }, + { + "epoch": 2.988857938718663, + "grad_norm": 1.7702594995498657, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8802556991577148, + "num_tokens": 586735632.0, + "step": 16095 + }, + { + "epoch": 2.9890436397400184, + "grad_norm": 1.5994254350662231, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8756831884384155, + "num_tokens": 586775719.0, + "step": 16096 + }, + { + "epoch": 2.989229340761374, + "grad_norm": 1.4979803562164307, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8902509212493896, + "num_tokens": 586816712.0, + "step": 16097 + }, + { + "epoch": 2.98941504178273, + "grad_norm": 1.6224377155303955, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8845877647399902, + "num_tokens": 586849372.0, + "step": 16098 + }, + { + "epoch": 2.989600742804085, + "grad_norm": 1.5689374208450317, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8886169195175171, + "num_tokens": 586885503.0, + "step": 16099 + }, + { + "epoch": 2.989786443825441, + "grad_norm": 1.549564242362976, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8843132257461548, + "num_tokens": 586925358.0, + "step": 16100 + }, + { + "epoch": 2.9899721448467966, + "grad_norm": 1.7230463027954102, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8866379857063293, + "num_tokens": 586963089.0, + "step": 16101 + }, + { + "epoch": 2.9901578458681524, + "grad_norm": 1.5829013586044312, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8729708194732666, + "num_tokens": 587000515.0, + "step": 16102 + }, + { + "epoch": 2.990343546889508, + "grad_norm": 1.5493268966674805, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8929811716079712, + "num_tokens": 587041138.0, + "step": 16103 + }, + { + "epoch": 2.9905292479108634, + "grad_norm": 1.7036964893341064, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8729754686355591, + "num_tokens": 587080265.0, + "step": 16104 + }, + { + "epoch": 2.990714948932219, + "grad_norm": 1.6556396484375, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.883493185043335, + "num_tokens": 587116496.0, + "step": 16105 + }, + { + "epoch": 2.990900649953575, + "grad_norm": 1.5274755954742432, + "learning_rate": 1e-06, + "loss": 0.2749, + "mean_token_accuracy": 0.8986341953277588, + "num_tokens": 587151654.0, + "step": 16106 + }, + { + "epoch": 2.99108635097493, + "grad_norm": 1.664331078529358, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8750811815261841, + "num_tokens": 587189601.0, + "step": 16107 + }, + { + "epoch": 2.991272051996286, + "grad_norm": 1.7755850553512573, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8742687702178955, + "num_tokens": 587223194.0, + "step": 16108 + }, + { + "epoch": 2.9914577530176416, + "grad_norm": 1.6441315412521362, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8783420324325562, + "num_tokens": 587263084.0, + "step": 16109 + }, + { + "epoch": 2.9916434540389973, + "grad_norm": 1.5484131574630737, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8956468105316162, + "num_tokens": 587300565.0, + "step": 16110 + }, + { + "epoch": 2.991829155060353, + "grad_norm": 1.5297036170959473, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8901153802871704, + "num_tokens": 587336268.0, + "step": 16111 + }, + { + "epoch": 2.9920148560817084, + "grad_norm": 1.5464786291122437, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.89055335521698, + "num_tokens": 587373793.0, + "step": 16112 + }, + { + "epoch": 2.992200557103064, + "grad_norm": 1.5589308738708496, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8857623338699341, + "num_tokens": 587415297.0, + "step": 16113 + }, + { + "epoch": 2.99238625812442, + "grad_norm": 1.7768497467041016, + "learning_rate": 1e-06, + "loss": 0.2795, + "mean_token_accuracy": 0.9016176462173462, + "num_tokens": 587443995.0, + "step": 16114 + }, + { + "epoch": 2.992571959145775, + "grad_norm": 1.7475471496582031, + "learning_rate": 1e-06, + "loss": 0.2728, + "mean_token_accuracy": 0.9018720388412476, + "num_tokens": 587471633.0, + "step": 16115 + }, + { + "epoch": 2.992757660167131, + "grad_norm": 1.5668079853057861, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8955522775650024, + "num_tokens": 587506346.0, + "step": 16116 + }, + { + "epoch": 2.9929433611884866, + "grad_norm": 1.506032109260559, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8936036825180054, + "num_tokens": 587544579.0, + "step": 16117 + }, + { + "epoch": 2.9931290622098423, + "grad_norm": 1.5088011026382446, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8913363218307495, + "num_tokens": 587582613.0, + "step": 16118 + }, + { + "epoch": 2.9933147632311976, + "grad_norm": 1.627886176109314, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8796988725662231, + "num_tokens": 587623546.0, + "step": 16119 + }, + { + "epoch": 2.9935004642525533, + "grad_norm": 1.615167260169983, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8896589279174805, + "num_tokens": 587662205.0, + "step": 16120 + }, + { + "epoch": 2.993686165273909, + "grad_norm": 1.4748692512512207, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.885022759437561, + "num_tokens": 587704878.0, + "step": 16121 + }, + { + "epoch": 2.9938718662952644, + "grad_norm": 1.7222850322723389, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.8992125988006592, + "num_tokens": 587737921.0, + "step": 16122 + }, + { + "epoch": 2.99405756731662, + "grad_norm": 1.5977764129638672, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8905455470085144, + "num_tokens": 587773459.0, + "step": 16123 + }, + { + "epoch": 2.994243268337976, + "grad_norm": 1.569344162940979, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8922718167304993, + "num_tokens": 587809179.0, + "step": 16124 + }, + { + "epoch": 2.9944289693593316, + "grad_norm": 1.4428881406784058, + "learning_rate": 1e-06, + "loss": 0.2571, + "mean_token_accuracy": 0.9045464396476746, + "num_tokens": 587847448.0, + "step": 16125 + }, + { + "epoch": 2.9946146703806873, + "grad_norm": 1.6434621810913086, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8871367573738098, + "num_tokens": 587879986.0, + "step": 16126 + }, + { + "epoch": 2.9948003714020426, + "grad_norm": 1.6608046293258667, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8800076246261597, + "num_tokens": 587913713.0, + "step": 16127 + }, + { + "epoch": 2.9949860724233983, + "grad_norm": 1.6170998811721802, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8864619731903076, + "num_tokens": 587952616.0, + "step": 16128 + }, + { + "epoch": 2.995171773444754, + "grad_norm": 1.7735347747802734, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8875308036804199, + "num_tokens": 587988373.0, + "step": 16129 + }, + { + "epoch": 2.9953574744661093, + "grad_norm": 1.7415367364883423, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8780982494354248, + "num_tokens": 588025551.0, + "step": 16130 + }, + { + "epoch": 2.995543175487465, + "grad_norm": 1.4156053066253662, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8894441723823547, + "num_tokens": 588069493.0, + "step": 16131 + }, + { + "epoch": 2.995728876508821, + "grad_norm": 1.8780007362365723, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8938267827033997, + "num_tokens": 588095676.0, + "step": 16132 + }, + { + "epoch": 2.9959145775301765, + "grad_norm": 1.7850550413131714, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8736916780471802, + "num_tokens": 588129675.0, + "step": 16133 + }, + { + "epoch": 2.9961002785515323, + "grad_norm": 1.7118844985961914, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8867138624191284, + "num_tokens": 588160966.0, + "step": 16134 + }, + { + "epoch": 2.9962859795728876, + "grad_norm": 1.6425246000289917, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8760101795196533, + "num_tokens": 588199041.0, + "step": 16135 + }, + { + "epoch": 2.9964716805942433, + "grad_norm": 1.7717100381851196, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8932822346687317, + "num_tokens": 588231940.0, + "step": 16136 + }, + { + "epoch": 2.996657381615599, + "grad_norm": 1.6368290185928345, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8810197710990906, + "num_tokens": 588268218.0, + "step": 16137 + }, + { + "epoch": 2.9968430826369543, + "grad_norm": 1.681966781616211, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.878046989440918, + "num_tokens": 588304242.0, + "step": 16138 + }, + { + "epoch": 2.99702878365831, + "grad_norm": 1.6261589527130127, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8893080353736877, + "num_tokens": 588340420.0, + "step": 16139 + }, + { + "epoch": 2.997214484679666, + "grad_norm": 1.5406392812728882, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.892128050327301, + "num_tokens": 588380128.0, + "step": 16140 + }, + { + "epoch": 2.9974001857010215, + "grad_norm": 1.5853328704833984, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8801308274269104, + "num_tokens": 588417445.0, + "step": 16141 + }, + { + "epoch": 2.9975858867223772, + "grad_norm": 1.7570189237594604, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8890150785446167, + "num_tokens": 588446908.0, + "step": 16142 + }, + { + "epoch": 2.9977715877437325, + "grad_norm": 1.7748026847839355, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8925282955169678, + "num_tokens": 588478314.0, + "step": 16143 + }, + { + "epoch": 2.9979572887650883, + "grad_norm": 1.7612360715866089, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8676536679267883, + "num_tokens": 588513045.0, + "step": 16144 + }, + { + "epoch": 2.9981429897864436, + "grad_norm": 1.5823160409927368, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8951135277748108, + "num_tokens": 588550587.0, + "step": 16145 + }, + { + "epoch": 2.9983286908077993, + "grad_norm": 1.6610820293426514, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8869246244430542, + "num_tokens": 588584856.0, + "step": 16146 + }, + { + "epoch": 2.998514391829155, + "grad_norm": 1.4735394716262817, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8939156532287598, + "num_tokens": 588626792.0, + "step": 16147 + }, + { + "epoch": 2.9987000928505108, + "grad_norm": 1.700634241104126, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.885032057762146, + "num_tokens": 588659601.0, + "step": 16148 + }, + { + "epoch": 2.9988857938718665, + "grad_norm": 1.5416537523269653, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8902006149291992, + "num_tokens": 588696879.0, + "step": 16149 + }, + { + "epoch": 2.999071494893222, + "grad_norm": 1.8261802196502686, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8902369141578674, + "num_tokens": 588726028.0, + "step": 16150 + }, + { + "epoch": 2.9992571959145775, + "grad_norm": 1.656036615371704, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8840683698654175, + "num_tokens": 588762530.0, + "step": 16151 + }, + { + "epoch": 2.9994428969359332, + "grad_norm": 1.6115039587020874, + "learning_rate": 1e-06, + "loss": 0.2846, + "mean_token_accuracy": 0.8990951180458069, + "num_tokens": 588796619.0, + "step": 16152 + }, + { + "epoch": 2.9996285979572885, + "grad_norm": 1.7213068008422852, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8917307257652283, + "num_tokens": 588827471.0, + "step": 16153 + }, + { + "epoch": 2.9998142989786443, + "grad_norm": 1.5925647020339966, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8895106911659241, + "num_tokens": 588862721.0, + "step": 16154 + }, + { + "epoch": 3.0, + "grad_norm": 1.6200906038284302, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8867772817611694, + "num_tokens": 588899838.0, + "step": 16155 + }, + { + "epoch": 3.0, + "step": 16155, + "total_flos": 2.651789354410495e+19, + "train_loss": 0.3651657155564753, + "train_runtime": 23763.887, + "train_samples_per_second": 10.877, + "train_steps_per_second": 0.68 + } + ], + "logging_steps": 1, + "max_steps": 16155, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 8078, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.651789354410495e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..8725e63 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4546edc1734f55ef25de707218d430afb42cc8ed73bad1e37f41416d933ad31d +size 13329